diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -440,15 +440,9 @@
   /// The data for the single globalized variable.
   struct MappedVarData {
     /// Corresponding field in the global record.
-    const FieldDecl *FD = nullptr;
+    llvm::Value *GlobalizedVal = nullptr;
     /// Corresponding address.
     Address PrivateAddr = Address::invalid();
-    /// true, if only one element is required (for lastprivates in SPMD mode),
-    /// false, if need to create based on the warp-size.
-    bool IsOnePerTeam = false;
-    MappedVarData() = delete;
-    MappedVarData(const FieldDecl *FD, bool IsOnePerTeam = false)
-        : FD(FD), IsOnePerTeam(IsOnePerTeam) {}
   };
   /// The map of local variables to their addresses in the global memory.
   using DeclToAddrMapTy = llvm::MapVector<const Decl *, MappedVarData>;
@@ -460,29 +454,12 @@
     EscapedParamsTy EscapedParameters;
     llvm::SmallVector<const ValueDecl *, 4> EscapedVariableLengthDecls;
     llvm::SmallVector<llvm::Value *, 4> EscapedVariableLengthDeclsAddrs;
-    const RecordDecl *GlobalRecord = nullptr;
-    llvm::Optional<const RecordDecl *> SecondaryGlobalRecord = llvm::None;
-    llvm::Value *GlobalRecordAddr = nullptr;
     llvm::Value *IsInSPMDModeFlag = nullptr;
     std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams;
   };
   /// Maps the function to the list of the globalized variables with their
   /// addresses.
   llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
-  /// List of records for the globalized variables in target/teams/distribute
-  /// contexts. Inner records are going to be joined into the single record,
-  /// while those resulting records are going to be joined into the single
-  /// union. This resulting union (one per CU) is the entry point for the static
-  /// memory management runtime functions.
-  struct GlobalPtrSizeRecsTy {
-    llvm::GlobalVariable *UseSharedMemory = nullptr;
-    llvm::GlobalVariable *RecSize = nullptr;
-    llvm::GlobalVariable *Buffer = nullptr;
-    SourceLocation Loc;
-    llvm::SmallVector<const RecordDecl *, 2> Records;
-    unsigned RegionCounter = 0;
-  };
-  llvm::SmallVector<GlobalPtrSizeRecsTy, 8> GlobalizedRecords;
   llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
   /// List of the records with the list of fields for the reductions across the
   /// teams. Used to build the intermediate buffer for the fast teams
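For reference, these are the device-runtime entry points the CodeGen changes below target, with signatures implied by the updated FileCheck lines in the tests (an i64 size operand and an i8* result). The prototypes are reconstructed here for illustration only and are not part of this patch; the authoritative declarations live in llvm/include/llvm/Frontend/OpenMP/OMPKinds.def and the device runtime:

    // Reconstructed C prototypes (assumption; see OMPKinds.def for the real ones).
    void *__kmpc_alloc_shared(size_t Size); // storage for one globalized variable
    void __kmpc_free_shared(void *Ptr);     // release a __kmpc_alloc_shared allocation

Each escaping local now gets its own paired allocation and deallocation, replacing the per-kernel global record and its sizing machinery removed from the header above.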
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1102,17 +1102,6 @@
   } Action(EST, WST);
   CodeGen.setAction(Action);
   IsInTTDRegion = true;
-  // Reserve place for the globalized memory.
-  GlobalizedRecords.emplace_back();
-  if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::UndefValue::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
-  }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
   IsInTTDRegion = false;
@@ -1164,10 +1153,6 @@
                           CGM.getModule(), OMPRTL___kmpc_kernel_init),
                       Args);
-  // For data sharing, we need to initialize the stack.
-  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-      CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));
-
   emitGenericVarsProlog(CGF, WST.Loc);
 }
@@ -1236,17 +1221,6 @@
   } Action(*this, EST, D);
   CodeGen.setAction(Action);
   IsInTTDRegion = true;
-  // Reserve place for the globalized memory.
-  GlobalizedRecords.emplace_back();
-  if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::UndefValue::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
-  }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
   IsInTTDRegion = false;
@@ -1268,12 +1242,6 @@
                           CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
                       Args);
-  if (RequiresFullRuntime) {
-    // For data sharing, we need to initialize the stack.
-    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-        CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
-  }
-
   CGF.EmitBranch(ExecuteBB);
   CGF.EmitBlock(ExecuteBB);
@@ -1679,16 +1647,13 @@
       static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   if (GlobalizedRD) {
     auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
-    I->getSecond().GlobalRecord = GlobalizedRD;
     I->getSecond().MappedParams =
         std::make_unique<CodeGenFunction::OMPMapVars>();
     DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
     for (const auto &Pair : MappedDeclsFields) {
       assert(Pair.getFirst()->isCanonicalDecl() &&
              "Expected canonical declaration");
-      Data.insert(std::make_pair(Pair.getFirst(),
-                                 MappedVarData(Pair.getSecond(),
-                                               /*IsOnePerTeam=*/true)));
+      Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
     }
   }
   Rt.emitGenericVarsProlog(CGF, Loc);
@@ -1717,282 +1682,69 @@
   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
   if (I == FunctionGlobalizedDecls.end())
     return;
-  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
-    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
-    QualType SecGlobalRecTy;
-    // Recover pointer to this function's global record. The runtime will
-    // handle the specifics of the allocation of the memory.
-    // Use actual memory size of the record including the padding
-    // for alignment purposes.
-    unsigned Alignment =
-        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
-    unsigned GlobalRecordSize =
-        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
-    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
-
-    llvm::PointerType *GlobalRecPtrTy =
-        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
-    llvm::Value *GlobalRecCastAddr;
-    llvm::Value *IsTTD = nullptr;
-    if (!IsInTTDRegion &&
-        (WithSPMDCheck ||
-         getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
-      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
-      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
-      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
-      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
-        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-        llvm::Value *ThreadID = getThreadID(CGF, Loc);
-        llvm::Value *PL = CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
-                                                  OMPRTL___kmpc_parallel_level),
-            {RTLoc, ThreadID});
-        IsTTD = Bld.CreateIsNull(PL);
-      }
-      llvm::Value *IsSPMD = Bld.CreateIsNotNull(
-          CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
-      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
-      // There is no need to emit line number for unconditional branch.
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      CGF.EmitBlock(SPMDBB);
-      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
-                               CharUnits::fromQuantity(Alignment));
-      CGF.EmitBranch(ExitBB);
-      // There is no need to emit line number for unconditional branch.
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      CGF.EmitBlock(NonSPMDBB);
-      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
-      if (const RecordDecl *SecGlobalizedVarsRecord =
-              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
-        SecGlobalRecTy =
-            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
-
-        // Recover pointer to this function's global record. The runtime will
-        // handle the specifics of the allocation of the memory.
-        // Use actual memory size of the record including the padding
-        // for alignment purposes.
-        unsigned Alignment =
-            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
-        unsigned GlobalRecordSize =
-            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
-        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
-        Size = Bld.CreateSelect(
-            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
-      }
-      // TODO: allow the usage of shared memory to be controlled by
-      // the user, for now, default to global.
-      llvm::Value *GlobalRecordSizeArg[] = {
-          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
-      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
-          GlobalRecordSizeArg);
-      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-          GlobalRecValue, GlobalRecPtrTy);
-      CGF.EmitBlock(ExitBB);
-      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
-                                /*NumReservedValues=*/2, "_select_stack");
-      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
-      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
-      GlobalRecCastAddr = Phi;
-      I->getSecond().GlobalRecordAddr = Phi;
-      I->getSecond().IsInSPMDModeFlag = IsSPMD;
-    } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
-      assert(GlobalizedRecords.back().Records.size() < 2 &&
-             "Expected less than 2 globalized records: one for target and one "
-             "for teams.");
-      unsigned Offset = 0;
-      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
-        QualType RDTy = CGM.getContext().getRecordType(RD);
-        unsigned Alignment =
-            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
-        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
-        Offset =
-            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
-      }
-      unsigned Alignment =
-          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
-      Offset = llvm::alignTo(Offset, Alignment);
-      GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
-      ++GlobalizedRecords.back().RegionCounter;
-      if (GlobalizedRecords.back().Records.size() == 1) {
-        assert(KernelStaticGlobalized &&
-               "Kernel static pointer must be initialized already.");
-        auto *UseSharedMemory = new llvm::GlobalVariable(
-            CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
-            llvm::GlobalValue::InternalLinkage, nullptr,
-            "_openmp_static_kernel$is_shared");
-        UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
-        QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
-            /*DestWidth=*/16, /*Signed=*/0);
-        llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
-            Address(UseSharedMemory,
-                    CGM.getContext().getTypeAlignInChars(Int16Ty)),
-            /*Volatile=*/false, Int16Ty, Loc);
-        auto *StaticGlobalized = new llvm::GlobalVariable(
-            CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
-            llvm::GlobalValue::CommonLinkage, nullptr);
-        auto *RecSize = new llvm::GlobalVariable(
-            CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
-            llvm::GlobalValue::InternalLinkage, nullptr,
-            "_openmp_static_kernel$size");
-        RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
-        llvm::Value *Ld = CGF.EmitLoadOfScalar(
-            Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
-            CGM.getContext().getSizeType(), Loc);
-        llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-            KernelStaticGlobalized, CGM.VoidPtrPtrTy);
-        llvm::Value *GlobalRecordSizeArg[] = {
-            llvm::ConstantInt::get(
-                CGM.Int16Ty,
-                getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
-            StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
-        CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
-            GlobalRecordSizeArg);
-        GlobalizedRecords.back().Buffer = StaticGlobalized;
-        GlobalizedRecords.back().RecSize = RecSize;
-        GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
-        GlobalizedRecords.back().Loc = Loc;
-      }
-      assert(KernelStaticGlobalized && "Global address must be set already.");
-      Address FrameAddr = CGF.EmitLoadOfPointer(
-          Address(KernelStaticGlobalized, CGM.getPointerAlign()),
-          CGM.getContext()
-              .getPointerType(CGM.getContext().VoidPtrTy)
-              .castAs<PointerType>());
-      llvm::Value *GlobalRecValue =
-          Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
-      I->getSecond().GlobalRecordAddr = GlobalRecValue;
-      I->getSecond().IsInSPMDModeFlag = nullptr;
-      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-          GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
-    } else {
-      // TODO: allow the usage of shared memory to be controlled by
-      // the user, for now, default to global.
-      bool UseSharedMemory =
-          IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
-      llvm::Value *GlobalRecordSizeArg[] = {
-          llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
-          CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
-      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(),
-              IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
-                            : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
-          GlobalRecordSizeArg);
-      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-          GlobalRecValue, GlobalRecPtrTy);
-      I->getSecond().GlobalRecordAddr = GlobalRecValue;
-      I->getSecond().IsInSPMDModeFlag = nullptr;
+  for (auto &Rec : I->getSecond().LocalVarData) {
+    const auto *VD = cast<VarDecl>(Rec.first);
+    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
+    QualType VarTy = VD->getType();
+
+    // Get the local allocation of a firstprivate variable before sharing
+    llvm::Value *ParValue;
+    if (EscapedParam) {
+      LValue ParLVal =
+          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
+      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
     }
-    LValue Base =
-        CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
-
-    // Emit the "global alloca" which is a GEP from the global declaration
-    // record using the pointer returned by the runtime.
-    LValue SecBase;
-    decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
-    if (IsTTD) {
-      SecIt = I->getSecond().SecondaryLocalVarData->begin();
-      llvm::PointerType *SecGlobalRecPtrTy =
-          CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
-      SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
-          Bld.CreatePointerBitCastOrAddrSpaceCast(
-              I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
-          SecGlobalRecTy);
-    }
-    for (auto &Rec : I->getSecond().LocalVarData) {
-      bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
-      llvm::Value *ParValue;
-      if (EscapedParam) {
-        const auto *VD = cast<VarDecl>(Rec.first);
-        LValue ParLVal =
-            CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
-        ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
-      }
-      LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
-      // Emit VarAddr basing on lane-id if required.
-      QualType VarTy;
-      if (Rec.second.IsOnePerTeam) {
-        VarTy = Rec.second.FD->getType();
-      } else {
-        Address Addr = VarAddr.getAddress(CGF);
-        llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
-            Addr.getElementType(), Addr.getPointer(),
-            {Bld.getInt32(0), getNVPTXLaneID(CGF)});
-        VarTy =
-            Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
-        VarAddr = CGF.MakeAddrLValue(
-            Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
-            AlignmentSource::Decl);
-      }
-      Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
-      if (!IsInTTDRegion &&
-          (WithSPMDCheck ||
-           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
-        assert(I->getSecond().IsInSPMDModeFlag &&
-               "Expected unknown execution mode or required SPMD check.");
-        if (IsTTD) {
-          assert(SecIt->second.IsOnePerTeam &&
-                 "Secondary glob data must be one per team.");
-          LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
-          VarAddr.setAddress(
-              Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
-                                       VarAddr.getPointer(CGF)),
-                      VarAddr.getAlignment()));
-          Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
-        }
-        Address GlobalPtr = Rec.second.PrivateAddr;
-        Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
-        Rec.second.PrivateAddr = Address(
-            Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
-                             LocalAddr.getPointer(), GlobalPtr.getPointer()),
-            LocalAddr.getAlignment());
-      }
-      if (EscapedParam) {
-        const auto *VD = cast<VarDecl>(Rec.first);
-        CGF.EmitStoreOfScalar(ParValue, VarAddr);
-        I->getSecond().MappedParams->setVarAddr(CGF, VD,
-                                                VarAddr.getAddress(CGF));
-      }
-      if (IsTTD)
-        ++SecIt;
+
+    // Allocate space for the variable to be globalized
+    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
+    llvm::Instruction *VoidPtr =
+        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
+                            AllocArgs, VD->getName());
+
+    // Cast the void pointer and get the address of the globalized variable.
+    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
+    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
+    LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
+    Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
+    Rec.second.GlobalizedVal = VoidPtr;
+
+    // Assign the local allocation to the newly globalized location.
+    if (EscapedParam) {
+      CGF.EmitStoreOfScalar(ParValue, VarAddr);
+      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
    }
+    if (auto *DI = CGF.getDebugInfo())
+      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
   }
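The loop above replaces the field-in-global-record lookup with one runtime allocation per escaping variable. For a single escaping `int`, the IR it produces has the following shape, matching the CHECK lines in the updated tests further down (register names here are illustrative):

    ; Sketch of the emitted pattern for one escaping 'int' local:
    %a = call i8* @__kmpc_alloc_shared(i64 4)
    %a_on_stack = bitcast i8* %a to i32*
    ; ... all uses go through %a_on_stack instead of an alloca ...
    call void @__kmpc_free_shared(i8* %a)      ; emitted by the epilog below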
-  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
-    // Recover pointer to this function's global record. The runtime will
-    // handle the specifics of the allocation of the memory.
-    // Use actual memory size of the record including the padding
+  for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
+    // Use actual memory size of the VLA object including the padding
     // for alignment purposes.
-    CGBuilderTy &Bld = CGF.Builder;
     llvm::Value *Size = CGF.getTypeSize(VD->getType());
     CharUnits Align = CGM.getContext().getDeclAlign(VD);
     Size = Bld.CreateNUWAdd(
         Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
     llvm::Value *AlignVal =
         llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
+    Size = Bld.CreateUDiv(Size, AlignVal);
     Size = Bld.CreateNUWMul(Size, AlignVal);
-    // TODO: allow the usage of shared memory to be controlled by
-    // the user, for now, default to global.
-    llvm::Value *GlobalRecordSizeArg[] = {
-        Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
-    llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
-        GlobalRecordSizeArg);
-    llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-        GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
-    LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
+
+    // Allocate space for this VLA object to be globalized.
+    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
+    llvm::Instruction *VoidPtr =
+        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
+                            AllocArgs, VD->getName());
+
+    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(VoidPtr);
+    LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
                                      CGM.getContext().getDeclAlign(VD),
                                      AlignmentSource::Decl);
     I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
                                             Base.getAddress(CGF));
-    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
   }
   I->getSecond().MappedParams->apply(CGF);
 }
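The added CreateUDiv also fixes the VLA size rounding: the pre-existing sequence multiplied the un-divided sum, computing (Size + Align - 1) * Align, while the add/udiv/mul triple now rounds Size up to the next multiple of Align. In scalar form:

    // What the NUWAdd/UDiv/NUWMul sequence above computes (C sketch):
    size_t paddedSize(size_t Size, size_t Align) {
      return (Size + Align - 1) / Align * Align; // e.g. Size 13, Align 8 -> 16
    }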
@@ -2005,60 +1757,20 @@
   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
   if (I != FunctionGlobalizedDecls.end()) {
-    I->getSecond().MappedParams->restore(CGF);
-    if (!CGF.HaveInsertPoint())
-      return;
+    // Deallocate the memory for each globalized VLA object
     for (llvm::Value *Addr :
          llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
-      CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
-          Addr);
+      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                              CGM.getModule(), OMPRTL___kmpc_free_shared),
+                          Addr);
     }
-    if (I->getSecond().GlobalRecordAddr) {
-      if (!IsInTTDRegion &&
-          (WithSPMDCheck ||
-           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
-        CGBuilderTy &Bld = CGF.Builder;
-        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
-        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
-        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
-        // There is no need to emit line number for unconditional branch.
-        (void)ApplyDebugLocation::CreateEmpty(CGF);
-        CGF.EmitBlock(NonSPMDBB);
-        CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
-            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
-        CGF.EmitBlock(ExitBB);
-      } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
-        assert(GlobalizedRecords.back().RegionCounter > 0 &&
-               "region counter must be > 0.");
-        --GlobalizedRecords.back().RegionCounter;
-        // Emit the restore function only in the target region.
-        if (GlobalizedRecords.back().RegionCounter == 0) {
-          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
-              /*DestWidth=*/16, /*Signed=*/0);
-          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
-              Address(GlobalizedRecords.back().UseSharedMemory,
-                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
-              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
-          llvm::Value *Args[] = {
-              llvm::ConstantInt::get(
-                  CGM.Int16Ty,
-                  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
-              IsInSharedMemory};
-          CGF.EmitRuntimeCall(
-              OMPBuilder.getOrCreateRuntimeFunction(
-                  CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
-              Args);
-        }
-      } else {
-        CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
-            I->getSecond().GlobalRecordAddr);
-      }
+    // Deallocate the memory for each globalized value
+    for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
+      I->getSecond().MappedParams->restore(CGF);
+
+      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                              CGM.getModule(), OMPRTL___kmpc_free_shared),
+                          {Rec.second.GlobalizedVal});
     }
   }
 }
@@ -4183,7 +3895,6 @@
   auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
   I->getSecond().MappedParams =
       std::make_unique<CodeGenFunction::OMPMapVars>();
-  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
   I->getSecond().EscapedParameters.insert(
       VarChecker.getEscapedParameters().begin(),
       VarChecker.getEscapedParameters().end());
@@ -4192,21 +3903,16 @@
   DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
   for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
     assert(VD->isCanonicalDecl() && "Expected canonical declaration");
-    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
+    Data.insert(std::make_pair(VD, MappedVarData()));
   }
   if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
     CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
     VarChecker.Visit(Body);
-    I->getSecond().SecondaryGlobalRecord =
-        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
     I->getSecond().SecondaryLocalVarData.emplace();
     DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
     for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
       assert(VD->isCanonicalDecl() && "Expected canonical declaration");
-      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-      Data.insert(
-          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
+      Data.insert(std::make_pair(VD, MappedVarData()));
     }
   }
   if (!NeedToDelayGlobalization) {
@@ -4499,187 +4205,8 @@
   CGOpenMPRuntime::processRequiresDirective(D);
 }
-/// Get number of SMs and number of blocks per SM.
-static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
-  std::pair<unsigned, unsigned> Data;
-  if (CGM.getLangOpts().OpenMPCUDANumSMs)
-    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
-  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
-    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
-  if (Data.first && Data.second)
-    return Data;
-  switch (getCudaArch(CGM)) {
-  case CudaArch::SM_20:
-  case CudaArch::SM_21:
-  case CudaArch::SM_30:
-  case CudaArch::SM_32:
-  case CudaArch::SM_35:
-  case CudaArch::SM_37:
-  case CudaArch::SM_50:
-  case CudaArch::SM_52:
-  case CudaArch::SM_53:
-    return {16, 16};
-  case CudaArch::SM_60:
-  case CudaArch::SM_61:
-  case CudaArch::SM_62:
-    return {56, 32};
-  case CudaArch::SM_70:
-  case CudaArch::SM_72:
-  case CudaArch::SM_75:
-  case CudaArch::SM_80:
-  case CudaArch::SM_86:
-    return {84, 32};
-  case CudaArch::GFX600:
-  case CudaArch::GFX601:
-  case CudaArch::GFX602:
-  case CudaArch::GFX700:
-  case CudaArch::GFX701:
-  case CudaArch::GFX702:
-  case CudaArch::GFX703:
-  case CudaArch::GFX704:
-  case CudaArch::GFX705:
-  case CudaArch::GFX801:
-  case CudaArch::GFX802:
-  case CudaArch::GFX803:
-  case CudaArch::GFX805:
-  case CudaArch::GFX810:
-  case CudaArch::GFX900:
-  case CudaArch::GFX902:
-  case CudaArch::GFX904:
-  case CudaArch::GFX906:
-  case CudaArch::GFX908:
-  case CudaArch::GFX909:
-  case CudaArch::GFX90a:
-  case CudaArch::GFX90c:
-  case CudaArch::GFX1010:
-  case CudaArch::GFX1011:
-  case CudaArch::GFX1012:
-  case CudaArch::GFX1013:
-  case CudaArch::GFX1030:
-  case CudaArch::GFX1031:
-  case CudaArch::GFX1032:
-  case CudaArch::GFX1033:
-  case CudaArch::GFX1034:
-  case CudaArch::UNUSED:
-  case CudaArch::UNKNOWN:
-    break;
-  case CudaArch::LAST:
-    llvm_unreachable("Unexpected Cuda arch.");
-  }
-  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
-}
-
 void CGOpenMPRuntimeGPU::clear() {
-  if (!GlobalizedRecords.empty() &&
-      !CGM.getLangOpts().OpenMPCUDATargetParallel) {
-    ASTContext &C = CGM.getContext();
-    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
-    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
-    RecordDecl *StaticRD = C.buildImplicitRecord(
-        "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
-    StaticRD->startDefinition();
-    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
-        "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
-    SharedStaticRD->startDefinition();
-    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
-      if (Records.Records.empty())
-        continue;
-      unsigned Size = 0;
-      unsigned RecAlignment = 0;
-      for (const RecordDecl *RD : Records.Records) {
-        QualType RDTy = C.getRecordType(RD);
-        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
-        RecAlignment = std::max(RecAlignment, Alignment);
-        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
-        Size =
-            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
-      }
-      Size = llvm::alignTo(Size, RecAlignment);
-      llvm::APInt ArySize(/*numBits=*/64, Size);
-      QualType SubTy = C.getConstantArrayType(
-          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
-      const bool UseSharedMemory = Size <= SharedMemorySize;
-      auto *Field =
-          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
-                            SourceLocation(), SourceLocation(), nullptr, SubTy,
-                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
-                            /*BW=*/nullptr, /*Mutable=*/false,
-                            /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      if (UseSharedMemory) {
-        SharedStaticRD->addDecl(Field);
-        SharedRecs.push_back(&Records);
-      } else {
-        StaticRD->addDecl(Field);
-        GlobalRecs.push_back(&Records);
-      }
-      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
-      Records.UseSharedMemory->setInitializer(
-          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
-    }
-    // Allocate SharedMemorySize buffer for the shared memory.
-    // FIXME: nvlink does not handle weak linkage correctly (object with the
-    // different size are reported as erroneous).
-    // Restore this code as soon as nvlink is fixed.
-    if (!SharedStaticRD->field_empty()) {
-      llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
-      QualType SubTy = C.getConstantArrayType(
-          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
-      auto *Field = FieldDecl::Create(
-          C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
-          C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
-          /*BW=*/nullptr, /*Mutable=*/false,
-          /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      SharedStaticRD->addDecl(Field);
-    }
-    SharedStaticRD->completeDefinition();
-    if (!SharedStaticRD->field_empty()) {
-      QualType StaticTy = C.getRecordType(SharedStaticRD);
-      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMStaticTy,
-          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
-          llvm::UndefValue::get(LLVMStaticTy),
-          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
-          llvm::GlobalValue::NotThreadLocal,
-          C.getTargetAddressSpace(LangAS::cuda_shared));
-      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-          GV, CGM.VoidPtrTy);
-      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
-        Rec->Buffer->replaceAllUsesWith(Replacement);
-        Rec->Buffer->eraseFromParent();
-      }
-    }
-    StaticRD->completeDefinition();
-    if (!StaticRD->field_empty()) {
-      QualType StaticTy = C.getRecordType(StaticRD);
-      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
-      llvm::APInt Size1(32, SMsBlockPerSM.second);
-      QualType Arr1Ty =
-          C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
-                                 /*IndexTypeQuals=*/0);
-      llvm::APInt Size2(32, SMsBlockPerSM.first);
-      QualType Arr2Ty =
-          C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
-                                 /*IndexTypeQuals=*/0);
-      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore CommonLinkage as soon as nvlink is fixed.
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(LLVMArr2Ty),
-          "_openmp_static_glob_rd_$_");
-      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-          GV, CGM.VoidPtrTy);
-      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
-        Rec->Buffer->replaceAllUsesWith(Replacement);
-        Rec->Buffer->eraseFromParent();
-      }
-    }
-  }
+
   if (!TeamsReductions.empty()) {
     ASTContext &C = CGM.getContext();
     RecordDecl *StaticRD = C.buildImplicitRecord(
diff --git a/clang/test/OpenMP/assumes_include_nvptx.cpp b/clang/test/OpenMP/assumes_include_nvptx.cpp
--- a/clang/test/OpenMP/assumes_include_nvptx.cpp
+++ b/clang/test/OpenMP/assumes_include_nvptx.cpp
@@ -19,8 +19,6 @@
 // CHECK-DAG: declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() [[attr1]]
 // CHECK: declare void @__kmpc_kernel_init(i32, i16)
 // CHECK-NOT: #
-// CHECK: declare void @__kmpc_data_sharing_init_stack()
-// CHECK-NOT: #
 // CHECK: declare float @_Z3sinf(float) [[attr2:#[0-9]*]]
 // CHECK: declare void @__kmpc_kernel_deinit(i16)
 // CHECK-NOT: #
diff --git a/clang/test/OpenMP/declare_target_codegen_globalization.cpp b/clang/test/OpenMP/declare_target_codegen_globalization.cpp
--- a/clang/test/OpenMP/declare_target_codegen_globalization.cpp
+++ b/clang/test/OpenMP/declare_target_codegen_globalization.cpp
@@ -33,7 +33,6 @@
 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
 // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
 // CHECK1: .execute:
 // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -81,39 +80,9 @@
 // CHECK1-LABEL: define {{[^@]+}}@_Z3barv
 // CHECK1-SAME: () #[[ATTR2]] {
 // CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[A2:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
-// CHECK1-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
-// CHECK1-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0
-// CHECK1-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR3:[0-9]+]]
-// CHECK1-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]]
-// CHECK1: .spmd:
-// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK1: .non-spmd:
-// CHECK1-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i64 4, i64 128
-// CHECK1-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 [[TMP5]], i16 0)
-// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty*
-// CHECK1-NEXT: br label [[DOTEXIT]]
-// CHECK1: .exit:
-// CHECK1-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ]
-// CHECK1-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0*
-// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0
-// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31
-// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A]], i32 0, i32 [[NVPTX_LANE_ID]]
-// CHECK1-NEXT: [[A1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[A1]], i32* [[TMP9]]
-// CHECK1-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[A2]], i32* [[TMP10]]
-// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[TMP11]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 [[CALL]], i32* [[RETVAL]], align 4
-// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTEXIT4:%.*]], label [[DOTNON_SPMD3:%.*]]
-// CHECK1: .non-spmd3:
-// CHECK1-NEXT: [[TMP12:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8*
-// CHECK1-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP12]])
-// CHECK1-NEXT: br label [[DOTEXIT4]]
-// CHECK1: .exit4:
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[RETVAL]], align 4
-// CHECK1-NEXT: ret i32 [[TMP13]]
+// CHECK1-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
+// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR4]]
+// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[A]])
+// CHECK1-NEXT: ret i32 [[CALL]]
 //
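The new checks reduce `bar()` from the mode-dependent push/pop dance to one alloc/free pair. Judging from the mangled names in the checks (`_Z3fooRi`, `_Z3barv`), the device functions under test are roughly of this shape; this is a reconstruction for orientation, not text from the test file:

    int foo(int &a) { return a; } // taking 'a' by reference makes it escape
    int bar() {
      int a;         // escaping local: lowered to __kmpc_alloc_shared(4)
      return foo(a); // released with __kmpc_free_shared before returning
    }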
diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp
--- a/clang/test/OpenMP/nvptx_data_sharing.cpp
+++ b/clang/test/OpenMP/nvptx_data_sharing.cpp
@@ -3,8 +3,7 @@
 ///==========================================================================///
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK
 // expected-no-diagnostics
@@ -78,8 +77,6 @@
 // CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
 // CHECK1: .exit:
 // CHECK1-NEXT: ret void
-//
-//
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15
 // CHECK1-SAME: () #[[ATTR1:[0-9]+]] {
 // CHECK1-NEXT: entry:
@@ -145,8 +142,6 @@
 // CHECK1-NEXT: br label [[DOTEXIT]]
 // CHECK1: .exit:
 // CHECK1-NEXT: ret void
-//
-//
 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
 // CHECK1-NEXT: entry:
@@ -159,8 +154,6 @@
 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
 // CHECK1-NEXT: store i32 1000, i32* [[TMP0]], align 4
 // CHECK1-NEXT: ret void
-//
-//
 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 // CHECK1-NEXT: entry:
@@ -178,8 +171,6 @@
 // CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
 // CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
 // CHECK1-NEXT: ret void
-//
-//
 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
 // CHECK1-NEXT: entry:
@@ -200,8 +191,6 @@
 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10000
 // CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
 // CHECK1-NEXT: ret void
-//
-//
 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
 // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 // CHECK1-NEXT: entry:
@@ -222,8 +211,6 @@
 // CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
 // CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
 // CHECK1-NEXT: ret void
-//
-//
 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15_worker
 // CHECK2-SAME: () #[[ATTR0:[0-9]+]] {
 // CHECK2-NEXT: entry:
@@ -271,8 +258,6 @@
 // CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
 // CHECK2: .exit:
 // CHECK2-NEXT: ret void
-//
-//
 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15
 // CHECK2-SAME: () #[[ATTR1:[0-9]+]] {
 // CHECK2-NEXT: entry:
@@ -333,8 +318,6 @@
 // CHECK2-NEXT: br label [[DOTEXIT]]
 // CHECK2: .exit:
 // CHECK2-NEXT: ret void
-//
-//
 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
 // CHECK2-NEXT: entry:
@@ -347,8 +330,6 @@
 // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
 // CHECK2-NEXT: store i32 1000, i32* [[TMP0]], align 4
 // CHECK2-NEXT: ret void
-//
-//
 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 // CHECK2-NEXT: entry:
@@ -366,8 +347,6 @@
 // CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
 // CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
 // CHECK2-NEXT: ret void
-//
-//
 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
 // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
 // CHECK2-NEXT: entry:
@@ -388,8 +367,6 @@
 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10000
 // CHECK2-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
 // CHECK2-NEXT: ret void
-//
-//
 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
 // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
 // CHECK2-NEXT: entry:
@@ -410,4 +387,190 @@
 // CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
 // CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
 // CHECK2-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l14_worker
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK: .await.work:
+// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK: .select.workers:
+// CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK: .execute.parallel:
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
+// CHECK-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK: .execute.fn:
+// CHECK-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK: .check.next:
+// CHECK-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
+// CHECK-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
+// CHECK: .execute.fn2:
+// CHECK-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK: .check.next3:
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK-NEXT: call void [[TMP7]](i16 0, i32 [[TMP4]])
+// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK: .terminate.parallel:
+// CHECK-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK: .barrier.parallel:
+// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK: .exit:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l14
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
+// CHECK-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [2 x i8*], align 8
+// CHECK-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK: .worker:
+// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l14_worker() #[[ATTR3]]
+// CHECK-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK: .mastercheck:
+// CHECK-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK: .master:
+// CHECK-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
+// CHECK-NEXT: [[B:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK-NEXT: [[B_ON_STACK:%.*]] = bitcast i8* [[B]] to i32*
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT: store i32 10, i32* [[A_ON_STACK]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
+// CHECK-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP8]], i64 1)
+// CHECK-NEXT: store i32 100, i32* [[B_ON_STACK]], align 4
+// CHECK-NEXT: store i32 1000, i32* [[C]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS7]], i64 0, i64 0
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[B_ON_STACK]] to i8*
+// CHECK-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS7]], i64 0, i64 1
+// CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
+// CHECK-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
+// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP13]], i64 2)
+// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[B]])
+// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[A]])
+// CHECK-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK: .termination.notifier:
+// CHECK-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK-NEXT: br label [[DOTEXIT]]
+// CHECK: .exit:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK-NEXT: store i32 1000, i32* [[TMP0]], align 4
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
+// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
+// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
+// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[C1:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8
+// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[B_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK-NEXT: store i32* [[C]], i32** [[C1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10000
+// CHECK-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
+// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
+// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32**
+// CHECK-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
+// CHECK-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
+// CHECK-NEXT: ret void
 //
diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
--- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -1,22 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
 // Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK3
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK4
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK5
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK6
-
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK7
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK8
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK4
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK9
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK10
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK11
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK12
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK5
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK6
 // expected-no-diagnostics
 #ifndef HEADER
@@ -75,8 +63,6 @@
 // CHECK1-NEXT: br label [[DOTEXIT:%.*]]
 // CHECK1: .exit:
 // CHECK1-NEXT: ret void
-//
-//
 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
 // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
 // CHECK1-NEXT: entry:
@@ -247,8 +233,6 @@
 // CHECK1-NEXT: [[TMP64:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
// CHECK1-NEXT: ret void
-//
-//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
@@ -376,8 +360,6 @@
// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
// CHECK1: omp.precond.end:
// CHECK1-NEXT: ret void
-//
-//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31
// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK2-NEXT: entry:
@@ -417,8 +399,6 @@
// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
-//
-//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
@@ -584,8 +564,6 @@
// CHECK2: omp.precond.end:
// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]])
// CHECK2-NEXT: ret void
-//
-//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
@@ -713,8 +691,6 @@
// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
// CHECK2: omp.precond.end:
// CHECK2-NEXT: ret void
-//
-//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31
// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK3-NEXT: entry:
@@ -752,8 +728,6 @@
// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
-//
-//
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
@@ -921,8 +895,6 @@
// CHECK3-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
// CHECK3-NEXT: call void
@__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]]) // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1044,991 +1016,6 @@ // CHECK3-NEXT: br label [[OMP_PRECOND_END]] // CHECK3: omp.precond.end: // CHECK3-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], 
[10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP5]], i16 [[TMP4]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0 -// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct._globalized_locals_ty* -// CHECK4-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP8]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], 0 
-// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP11]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 40, i1 false) -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP15:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP16]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -// CHECK4-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP19]], [[COND_TRUE]] ], [ [[TMP20]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 -// CHECK4-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP22]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK4-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK4-NEXT: [[TMP28:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP25]] to i8* -// 
CHECK4-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK4-NEXT: [[TMP30:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP31:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK4-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK4-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK4-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 4 -// CHECK4-NEXT: [[TMP34:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK4-NEXT: [[TMP35:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK4-NEXT: store i8* [[TMP35]], i8** [[TMP34]], align 4 -// CHECK4-NEXT: [[TMP36:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK4-NEXT: [[TMP37:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK4-NEXT: store i8* [[TMP37]], i8** [[TMP36]], align 4 -// CHECK4-NEXT: [[TMP38:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK4-NEXT: [[TMP39:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK4-NEXT: store i8* [[TMP39]], i8** [[TMP38]], align 4 -// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK4-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP40]], 0 -// CHECK4-NEXT: [[TMP41:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK4-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK4-NEXT: [[TMP44:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP43]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP44]], i32 7) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP45]], [[TMP46]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP47]], [[TMP48]] -// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP49]], [[TMP50]] -// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]] -// CHECK4-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK4: cond.true12: -// CHECK4-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: br label [[COND_END14:%.*]] -// CHECK4: cond.false13: -// CHECK4-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END14]] -// CHECK4: cond.end14: -// CHECK4-NEXT: 
[[COND15:%.*]] = phi i32 [ [[TMP53]], [[COND_TRUE12]] ], [ [[TMP54]], [[COND_FALSE13]] ] -// CHECK4-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP56:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP57]]) -// CHECK4-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 -// CHECK4-NEXT: br i1 [[TMP59]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK4: .omp.lastprivate.then: -// CHECK4-NEXT: [[TMP60:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK4-NEXT: [[TMP61:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP60]], i8* align 4 [[TMP61]], i32 40, i1 false) -// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK4: .omp.lastprivate.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK4-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], 
align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK4-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], 
align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK4-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK4-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK4-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK4-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK4-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK4-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK4-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK4-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK4-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK4-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK4: .omp.lastprivate.then: -// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK4-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK4: .omp.lastprivate.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: 
[[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: 
[[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK5-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -// CHECK5-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], 
align 4 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK5-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] -// CHECK5-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK5-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK5-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK5-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK5-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK5-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK5-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK5-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK5-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK5-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK5-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 -// CHECK5-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK5-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK5-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK5-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 -// CHECK5-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], 
i32 7) -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] -// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -// CHECK5-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK5: cond.true12: -// CHECK5-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK5-NEXT: br label [[COND_END14:%.*]] -// CHECK5: cond.false13: -// CHECK5-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END14]] -// CHECK5: cond.end14: -// CHECK5-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] -// CHECK5-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK5-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) -// CHECK5-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 -// CHECK5-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK5: .omp.lastprivate.then: -// CHECK5-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK5-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) -// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK5: .omp.lastprivate.done: -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) -// CHECK5-NEXT: ret void -// -// -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 
dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK5-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK5: omp.precond.then: -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK5-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: 
store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK5-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK5-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK5-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK5-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK5-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK5-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK5-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK5-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK5-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK5-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK5-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK5-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: 
[[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK5-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK5: .omp.lastprivate.then: -// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK5-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK5: .omp.lastprivate.done: -// CHECK5-NEXT: br label [[OMP_PRECOND_END]] -// CHECK5: omp.precond.end: -// CHECK5-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK6: .execute: -// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK6: .omp.deinit: -// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK6-NEXT: 
br label [[DOTEXIT:%.*]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK6-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: 
omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -// CHECK6-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK6-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] -// CHECK6-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK6-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK6-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK6-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK6-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK6-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK6-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to 
i8* -// CHECK6-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK6-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK6-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK6-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK6-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK6-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK6-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK6-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 -// CHECK6-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK6-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK6-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK6-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 -// CHECK6-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] -// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -// CHECK6-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK6: cond.true12: -// CHECK6-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK6-NEXT: br label [[COND_END14:%.*]] -// CHECK6: cond.false13: -// CHECK6-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END14]] -// CHECK6: cond.end14: -// CHECK6-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] -// CHECK6-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK6-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: 
[[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) -// CHECK6-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 -// CHECK6-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK6: .omp.lastprivate.then: -// CHECK6-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK6-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) -// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK6: .omp.lastprivate.done: -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) -// CHECK6-NEXT: ret void -// -// -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK6-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, 
i32** [[ARGC_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK6: omp.precond.then: -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK6-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK6-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK6-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK6-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK6-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// 
CHECK6-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK6-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK6-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK6-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK6-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK6-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK6-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK6-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK6-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK6: .omp.lastprivate.then: -// CHECK6-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK6-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK6: .omp.lastprivate.done: -// CHECK6-NEXT: br label [[OMP_PRECOND_END]] -// CHECK6: omp.precond.end: -// CHECK6-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 // CHECK7-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -2068,8 +1055,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -2240,8 +1225,6 @@ // CHECK7-NEXT: [[TMP64:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 
[[TMP64]]) // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -2369,8 +1352,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 // CHECK8-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK8-NEXT: entry: @@ -2410,8 +1391,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -2577,8 +1556,6 @@ // CHECK8: omp.precond.end: // CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -2706,8 +1683,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 // CHECK9-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -2745,8 +1720,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -2914,8 +1887,6 @@ // CHECK9-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK9-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]]) // 
CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -3037,8 +2008,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 // CHECK10-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK10-NEXT: entry: @@ -3076,8 +2045,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -3245,8 +2212,6 @@ // CHECK10-NEXT: [[TMP62:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK10-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP62]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -3368,656 +2333,1628 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 // CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 // CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], 
align 4 +// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store 
[10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) +// CHECK11-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK11-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK11-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +// CHECK11-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] 
] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK11-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] +// CHECK11-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK11-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK11-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK11-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK11-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK11-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK11-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK11-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK11-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK11-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK11-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 +// CHECK11-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK11-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] 
= add nsw i32 [[TMP42]], [[TMP43]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] +// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] +// CHECK11-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK11: cond.true12: +// CHECK11-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: br label [[COND_END14:%.*]] +// CHECK11: cond.false13: +// CHECK11-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END14]] +// CHECK11: cond.end14: +// CHECK11-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] +// CHECK11-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) +// CHECK11-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 +// CHECK11-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK11-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: 
[[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 // CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK11-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 // CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 // CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 // CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK11: .execute: -// CHECK11-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK11-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK11: .omp.deinit: -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK11-NEXT: br label [[DOTEXIT:%.*]] 
-// CHECK11: .exit: +// CHECK11-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK11-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK11-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK11-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK11-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// 
CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK11-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK11-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK11-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK11-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK11-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK11-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK11-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK11-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK11-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK11-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: // CHECK11-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 +// CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: 
store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK12-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x 
i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) +// CHECK12-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* +// CHECK12-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK12-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] +// CHECK12-NEXT: store 
i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 +// CHECK12-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] +// CHECK12-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK12-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK12-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK12-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK12-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK12-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK12-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK12-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK12-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK12-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK12-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK12-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK12-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK12-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK12-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 +// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK12-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 +// CHECK12-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK12-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 +// CHECK12-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], 
[[TMP43]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] +// CHECK12-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] +// CHECK12-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK12: cond.true12: +// CHECK12-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK12-NEXT: br label [[COND_END14:%.*]] +// CHECK12: cond.false13: +// CHECK12-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END14]] +// CHECK12: cond.end14: +// CHECK12-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] +// CHECK12-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) +// CHECK12-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 +// CHECK12-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK12: .omp.lastprivate.then: +// CHECK12-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK12-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* +// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) +// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK12: .omp.lastprivate.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca 
i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK12-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK12-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK12: omp.precond.then: +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK12-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, 
i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK12-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK12-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK12-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK12-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK12-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK12-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK12-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK12-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK12-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK12-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK12-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK12-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK12-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK12-NEXT: call 
void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK12-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK12-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK12: .omp.lastprivate.then: +// CHECK12-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK12-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK12: .omp.lastprivate.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] +// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l19 +// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[ARGC_CASTED]] to i32* +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[CONV1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] 
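// ---------------------------------------------------------------------------
// [Editorial aside; not part of the patch.] In the CHECK4/CHECK5 expectations,
// the outlined team function that follows globalizes the lastprivate array C
// through the new runtime pair: "[[C1:%.*]] = call i8* @__kmpc_alloc_shared(i64 40)"
// (i32 40 on the 32-bit CHECK5 run) at function entry, a memcpy of the team
// copy back into the mapped array in ".omp.lastprivate.then", and
// "call void @__kmpc_free_shared(i8* [[C1]])" in "omp.precond.end". The removed
// CHECK11 run expressed the same lifetime with
// @__kmpc_data_sharing_push_stack(i32 40, i16 1) / @__kmpc_data_sharing_pop_stack.
// Below is a minimal host-side sketch of that alloc / copy-back / free protocol.
// The *_mock functions are invented stand-ins for illustration only; the real
// entry points live in the OpenMP device runtime and hand out device-shared
// ("globalized") memory rather than plain heap memory.
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Stand-ins for @__kmpc_alloc_shared / @__kmpc_free_shared (hypothetical mocks).
static void *kmpc_alloc_shared_mock(size_t Bytes) { return std::malloc(Bytes); }
static void kmpc_free_shared_mock(void *Ptr) { std::free(Ptr); }

int main() {
  int CMapped[10] = {0}; // the mapped array the lastprivate result lands in
  // Prologue: globalize C, as in "call i8* @__kmpc_alloc_shared(i64 40)".
  int *COnStack = static_cast<int *>(kmpc_alloc_shared_mock(10 * sizeof(int)));
  for (int I = 0; I < 10; ++I) // the parallel body writes through the team copy
    COnStack[I] = I;
  // ".omp.lastprivate.then": copy the team copy back (the 40-byte memcpy).
  std::memcpy(CMapped, COnStack, 10 * sizeof(int));
  // Epilogue: "call void @__kmpc_free_shared(i8* [[C1]])".
  kmpc_free_shared_mock(COnStack);
  std::printf("CMapped[9] = %d\n", CMapped[9]); // expect 9
  return 0;
}
// ---------------------------------------------------------------------------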
+// CHECK4: .exit: +// CHECK4-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) -// CHECK11-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK11-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label 
[[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK11-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -// CHECK11-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK11-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] -// CHECK11-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK11-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK11-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK11-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* 
[[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK11-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK11-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK11-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK11-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK11-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK11-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK11-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 -// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK11-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK11-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK11-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 -// CHECK11-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] -// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -// CHECK11-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK11: cond.true12: -// CHECK11-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: br label [[COND_END14:%.*]] -// CHECK11: cond.false13: -// CHECK11-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END14]] -// CHECK11: cond.end14: -// CHECK11-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] -// CHECK11-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label 
[[OMP_INNER_FOR_COND]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) -// CHECK11-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 -// CHECK11-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK11: .omp.lastprivate.then: -// CHECK11-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK11-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) -// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) -// CHECK11-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i64 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 8 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = 
bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i64 40) +// CHECK4-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]* +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK4-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK4-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* 
[[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK4-NEXT: [[TMP28:%.*]] = bitcast i32* [[CONV]] to i8* +// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK4-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK4-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK4-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 +// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 5 +// CHECK4-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK4-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 +// CHECK4-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 6 +// CHECK4-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK4-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 +// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK4-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 +// CHECK4-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK4-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 +// CHECK4-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i64 7) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] +// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 
4 +// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] +// CHECK4-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK4: cond.true12: +// CHECK4-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK4-NEXT: br label [[COND_END14:%.*]] +// CHECK4: cond.false13: +// CHECK4-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END14]] +// CHECK4: cond.end14: +// CHECK4-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] +// CHECK4-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) +// CHECK4-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 +// CHECK4-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK4: .omp.lastprivate.then: +// CHECK4-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK4-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i64 40, i1 false) +// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK4: .omp.lastprivate.done: +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK4-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// 
CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK11-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK11-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK11-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK11-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK11-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK11-NEXT: 
[[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK11-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK11-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK11-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK11-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK11-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK11-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK11-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK11-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK11-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK11: omp.body.continue: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK11-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK11-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK11: .omp.lastprivate.then: -// CHECK11-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK11-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK11-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// 
CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK4-NEXT: [[C5:%.*]] = alloca [10 x i32], align 4 +// CHECK4-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 8 +// CHECK4-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* 
[[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK4-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP10]] to i32 +// CHECK4-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i64 40, i1 false) +// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[CONV7:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK4-NEXT: [[CMP8:%.*]] = icmp ule i64 [[CONV7]], [[TMP17]] +// CHECK4-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR5:[0-9]+]] +// CHECK4-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]] +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]] +// CHECK4-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK4-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]] +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK4-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64 +// CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]] +// CHECK4-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK4-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]] +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK4-NEXT: [[IDXPROM17:%.*]] = sext 
i32 [[TMP21]] to i64 +// CHECK4-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]] +// CHECK4-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR5]] +// CHECK4-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]] +// CHECK4-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK4-NEXT: store i32 [[ADD21]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK4-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK4: .omp.lastprivate.then: +// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK4-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C5]] to i8* +// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i64 40, i1 false) +// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK4: .omp.lastprivate.done: +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l19 +// CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: 
[[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// +// +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 40) +// CHECK5-NEXT: 
[[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]* +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK5-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK5-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK5-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, 
i32 0 +// CHECK5-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK5-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK5-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK5-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK5-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK5-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK5-NEXT: [[TMP30:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK5-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK5-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK5-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK5-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK5-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP35]], 0 +// CHECK5-NEXT: [[TMP36:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK5-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 +// CHECK5-NEXT: [[TMP39:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 [[TMP36]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i32 7) +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK5-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK5-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK5-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] +// CHECK5-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK5: 
cond.true12: +// CHECK5-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK5-NEXT: br label [[COND_END14:%.*]] +// CHECK5: cond.false13: +// CHECK5-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END14]] +// CHECK5: cond.end14: +// CHECK5-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE12]] ], [ [[TMP49]], [[COND_FALSE13]] ] +// CHECK5-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK5-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) +// CHECK5-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 +// CHECK5-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK5: .omp.lastprivate.then: +// CHECK5-NEXT: [[TMP55:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK5-NEXT: [[TMP56:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP55]], i8* align 4 [[TMP56]], i32 40, i1 false) +// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK5: .omp.lastprivate.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK5-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31 -// CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: 
call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK12: .execute: -// CHECK12-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 -// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK12-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] -// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK12: .omp.deinit: -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK12-NEXT: br label [[DOTEXIT:%.*]] -// CHECK12: .exit: -// CHECK12-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK5-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK5-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load 
i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK5-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK5-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK5-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK5-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK5: omp.precond.then: +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK5-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK5-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK5-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK5-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK5-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK5-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* 
[[ARRAYIDX]]) #[[ATTR5]] +// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK5-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK5-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK5-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK5-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK5-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK5-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK5-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK5-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK5-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK5-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK5: .omp.lastprivate.then: +// CHECK5-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK5-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK5-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK5: .omp.lastprivate.done: +// CHECK5-NEXT: br label [[OMP_PRECOND_END]] +// CHECK5: omp.precond.end: +// CHECK5-NEXT: ret void +// +// +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l19 +// CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: 
store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK6: .execute: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[ARGC_CASTED]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP6]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK6: .omp.deinit: +// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 -// CHECK12-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 
-// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 40, i16 1) -// CHECK12-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK12-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK12-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP10]], i8* align 4 [[TMP11]], i32 40, i1 false) -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: [[TMP12:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -// CHECK12-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP18]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label 
[[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK12-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP19]], [[ADD]] -// CHECK12-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK12-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK12-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK12-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK12-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK12-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* -// CHECK12-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK12-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK12-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP2]] to i8* -// CHECK12-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK12-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK12-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[B4]] to i8* -// CHECK12-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK12-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 -// CHECK12-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK12-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK12-NEXT: [[TMP35:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 -// CHECK12-NEXT: [[TMP36:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK12-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 4 -// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK12-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP37]], 0 -// CHECK12-NEXT: [[TMP38:%.*]] = zext i1 [[TOBOOL]] to i32 -// CHECK12-NEXT: [[TMP39:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4 -// CHECK12-NEXT: [[TMP41:%.*]] = bitcast [7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP40]], i32 [[TMP38]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP41]], i32 7) -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP45:%.*]] = load i32, i32* 
[[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP46]], [[TMP47]] -// CHECK12-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]] -// CHECK12-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] -// CHECK12: cond.true12: -// CHECK12-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: br label [[COND_END14:%.*]] -// CHECK12: cond.false13: -// CHECK12-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END14]] -// CHECK12: cond.end14: -// CHECK12-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP50]], [[COND_TRUE12]] ], [ [[TMP51]], [[COND_FALSE13]] ] -// CHECK12-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP52]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: [[TMP53:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP54]]) -// CHECK12-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP56:%.*]] = icmp ne i32 [[TMP55]], 0 -// CHECK12-NEXT: br i1 [[TMP56]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK12: .omp.lastprivate.then: -// CHECK12-NEXT: [[TMP57:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* -// CHECK12-NEXT: [[TMP58:%.*]] = bitcast [10 x i32]* [[C1]] to i8* -// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i32 40, i1 false) -// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK12: .omp.lastprivate.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP4]]) -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i32 [[ARGC:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// 
CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B4:%.*]] = alloca [10 x i32], align 4 +// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [7 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 40) +// CHECK6-NEXT: [[C_ON_STACK:%.*]] = bitcast i8* [[C1]] to [10 x i32]* +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: 
[[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK6-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK6-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK6-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK6-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i32* [[ARGC_ADDR]] to i8* +// CHECK6-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK6-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP2]] to i8* +// CHECK6-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK6-NEXT: [[TMP29:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK6-NEXT: [[TMP30:%.*]] = bitcast [10 x i32]* [[B4]] to i8* +// CHECK6-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK6-NEXT: [[TMP31:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 5 +// CHECK6-NEXT: [[TMP32:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK6-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr inbounds [7 x i8*], [7 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 6 +// CHECK6-NEXT: [[TMP34:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK6-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK6-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP35]], 0 +// CHECK6-NEXT: [[TMP36:%.*]] = zext i1 [[TOBOOL]] to i32 +// CHECK6-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 +// CHECK6-NEXT: [[TMP39:%.*]] = bitcast 
[7 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 [[TMP36]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32*, i32*, [10 x i32]*, [10 x i32]*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i32 7) +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK6-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK6-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] +// CHECK6-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] +// CHECK6-NEXT: br i1 [[CMP11]], label [[COND_TRUE12:%.*]], label [[COND_FALSE13:%.*]] +// CHECK6: cond.true12: +// CHECK6-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK6-NEXT: br label [[COND_END14:%.*]] +// CHECK6: cond.false13: +// CHECK6-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END14]] +// CHECK6: cond.end14: +// CHECK6-NEXT: [[COND15:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE12]] ], [ [[TMP49]], [[COND_FALSE13]] ] +// CHECK6-NEXT: store i32 [[COND15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK6-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) +// CHECK6-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 +// CHECK6-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP55:%.*]] = bitcast [10 x i32]* [[TMP1]] to i8* +// CHECK6-NEXT: [[TMP56:%.*]] = bitcast [10 x i32]* [[C_ON_STACK]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP55]], i8* align 4 [[TMP56]], i32 40, i1 false) +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK6-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 
[[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 -// CHECK12-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 -// CHECK12-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK12-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, 
i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* -// CHECK12-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* -// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) -// CHECK12-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] -// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK12-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] -// CHECK12-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK12-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] -// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK12-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK12-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] -// CHECK12-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK12-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK12-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] -// CHECK12-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] -// CHECK12-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 -// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK12: omp.body.continue: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* 
[[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK12-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) -// CHECK12-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -// CHECK12-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK12: .omp.lastprivate.then: -// CHECK12-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* -// CHECK12-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* -// CHECK12-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) -// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK12: .omp.lastprivate.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[C:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[D:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[B3:%.*]] = alloca [10 x i32], align 4 +// CHECK6-NEXT: [[C4:%.*]] = alloca [10 x i32], align 4 +// CHECK6-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[ARGC]], i32** 
[[ARGC_ADDR]], align 4 +// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[C]], [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: store [10 x i32]* [[D]], [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK6-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK6-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK6-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK6-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK6: omp.precond.then: +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK6-NEXT: store i32 [[TMP8]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[B3]] to i8* +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP11]], i8* align 4 [[TMP12]], i32 40, i1 false) +// CHECK6-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP14]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK6-NEXT: [[CMP6:%.*]] = icmp ule i32 [[TMP16]], [[TMP17]] +// CHECK6-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK6-NEXT: [[CALL:%.*]] = call i32 
@_Z3fooPi(i32* [[I5]]) #[[ATTR5:[0-9]+]] +// CHECK6-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] +// CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] +// CHECK6-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] +// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] +// CHECK6-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] +// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK6-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] +// CHECK6-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR5]] +// CHECK6-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] +// CHECK6-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK6-NEXT: store i32 [[ADD17]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]]) +// CHECK6-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CHECK6-NEXT: br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK6: .omp.lastprivate.then: +// CHECK6-NEXT: [[TMP28:%.*]] = bitcast [10 x i32]* [[TMP3]] to i8* +// CHECK6-NEXT: [[TMP29:%.*]] = bitcast [10 x i32]* [[C4]] to i8* +// CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP28]], i8* align 4 [[TMP29]], i32 40, i1 false) +// CHECK6-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK6: .omp.lastprivate.done: +// CHECK6-NEXT: br label [[OMP_PRECOND_END]] +// CHECK6: omp.precond.end: +// CHECK6-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_lambda_capturing.cpp b/clang/test/OpenMP/nvptx_lambda_capturing.cpp --- a/clang/test/OpenMP/nvptx_lambda_capturing.cpp +++ b/clang/test/OpenMP/nvptx_lambda_capturing.cpp @@ -830,7 +830,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP7:%.*]] = load %class.anon*, %class.anon** [[TMP]], align 8 // CHECK2-NEXT: [[TMP8:%.*]] = bitcast %class.anon* [[L7]] to i8* // CHECK2-NEXT: [[TMP9:%.*]] = bitcast %class.anon* 
[[TMP7]] to i8* @@ -877,7 +876,6 @@ // CHECK2-NEXT: store %class.anon* [[TMP1]], %class.anon** [[TMP]], align 8 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) @@ -1019,7 +1017,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP9:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8 // CHECK2-NEXT: [[TMP10:%.*]] = bitcast %class.anon.0* [[L9]] to i8* // CHECK2-NEXT: [[TMP11:%.*]] = bitcast %class.anon.0* [[TMP9]] to i8* @@ -1086,7 +1083,6 @@ // CHECK2-NEXT: store %class.anon.0* [[TMP4]], %class.anon.0** [[_TMP2]], align 8 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) @@ -1206,7 +1202,6 @@ // CHECK2-NEXT: store %class.anon* [[TMP0]], %class.anon** [[TMP]], align 8 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) @@ -1340,7 +1335,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP9:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8 // CHECK3-NEXT: [[TMP10:%.*]] = bitcast %class.anon* [[L9]] to i8* // CHECK3-NEXT: [[TMP11:%.*]] = bitcast %class.anon* [[TMP9]] to i8* @@ -1407,7 +1401,6 @@ // CHECK3-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) @@ -1588,7 +1581,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP7:%.*]] = load %class.anon.0*, 
%class.anon.0** [[TMP]], align 8 // CHECK3-NEXT: [[TMP8:%.*]] = bitcast %class.anon.0* [[L7]] to i8* // CHECK3-NEXT: [[TMP9:%.*]] = bitcast %class.anon.0* [[TMP7]] to i8* @@ -1635,7 +1627,6 @@ // CHECK3-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) @@ -1697,7 +1688,6 @@ // CHECK3-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) @@ -1831,7 +1821,6 @@ // CHECK4-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK4-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK4-NEXT: [[TMP9:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8 // CHECK4-NEXT: [[TMP10:%.*]] = bitcast %class.anon* [[L9]] to i8* // CHECK4-NEXT: [[TMP11:%.*]] = bitcast %class.anon* [[TMP9]] to i8* @@ -1898,7 +1887,6 @@ // CHECK4-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK4: .execute: // CHECK4-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) @@ -2079,7 +2067,6 @@ // CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK4-NEXT: [[TMP7:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8 // CHECK4-NEXT: [[TMP8:%.*]] = bitcast %class.anon.0* [[L7]] to i8* // CHECK4-NEXT: [[TMP9:%.*]] = bitcast %class.anon.0* [[TMP7]] to i8* @@ -2126,7 +2113,6 @@ // CHECK4-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK4: .execute: // CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) @@ -2188,7 +2174,6 @@ // CHECK4-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8 // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 
[[NVPTX_NUM_THREADS]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK4: .execute: // CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) diff --git a/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp @@ -31,7 +31,6 @@ // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK1: .execute: // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -128,7 +127,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: call void @_Z3usev() #[[ATTR7]] // CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK1: .termination.notifier: @@ -171,7 +169,6 @@ // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -268,7 +265,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: call void @_Z3usev() #[[ATTR7]] // CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK2: .termination.notifier: @@ -311,7 +307,6 @@ // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -408,7 +403,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: call void @_Z3usev() #[[ATTR7]] // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK3: .termination.notifier: diff --git a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp 
b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp @@ -106,7 +106,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] // CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) @@ -305,7 +304,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] // CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) @@ -504,7 +502,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] // CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -2,11 +2,8 @@ // Test target codegen - host bc file has to be created first. 
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -disable-O0-optnone | FileCheck %s --check-prefix=CHECK3 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK4 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK5 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -aux-triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -disable-O0-optnone | FileCheck %s --check-prefix=CHECK2 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -76,989 +73,6 @@ } #endif -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker -// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], 
label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK1: .execute.fn: -// CHECK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .check.next: -// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] -// CHECK1: .execute.fn2: -// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .check.next3: -// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] -// CHECK1: .execute.fn5: -// CHECK1-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .check.next6: -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 -// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 8 -// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: 
[[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i64 0) -// CHECK1-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i64 0) -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i64 0) -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 -// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32 42, i32* [[A]], align 4 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: 
[[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32 43, i32* [[A]], align 4 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32 44, i32* [[A]], align 4 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// 
CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK1: .execute.fn: -// CHECK1-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .check.next: -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 -// CHECK1-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 -// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16* -// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: 
[[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 -// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 -// CHECK1-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i64 0) -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV2]], align 8 -// CHECK1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32 -// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1 -// CHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16 -// CHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK1-NEXT: store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32 45, i32* [[A]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK1: .execute.fn: -// CHECK1-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .check.next: -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label 
[[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 -// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 -// 
CHECK1-NEXT: store i32 [[TMP10]], i32* [[A7]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP13:%.*]] = bitcast i32* [[A7]] to i8* -// CHECK1-NEXT: store i8* [[TMP13]], i8** [[TMP12]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP14]], i64 1) -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[A7]], align 4 -// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK1-NEXT: store i32 [[INC]], i32* [[A7]], align 4 -// CHECK1-NEXT: [[TMP16:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP16]]) -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] -// CHECK1: omp.critical.loop: -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] -// CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] -// CHECK1: omp.critical.test: -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] -// CHECK1-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] -// CHECK1: omp.critical.body: -// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 -// 
CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK1-NEXT: br label [[OMP_CRITICAL_SYNC]] -// CHECK1: omp.critical.sync: -// CHECK1-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) -// CHECK1-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 -// CHECK1-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 -// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP]] -// CHECK1: omp.critical.exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker -// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK2: .execute.fn: -// CHECK2-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .check.next: -// CHECK2-NEXT: 
[[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] -// CHECK2: .execute.fn2: -// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .check.next3: -// CHECK2-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] -// CHECK2: .execute.fn5: -// CHECK2-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .check.next6: -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 -// CHECK2-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 8 -// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// 
CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i64 0) -// CHECK2-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i64 0) -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i64 0) -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 -// CHECK2-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32 42, i32* [[A]], align 4 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// 
CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32 43, i32* [[A]], align 4 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32 44, i32* [[A]], align 4 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker -// CHECK2-SAME: () #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br 
i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK2: .execute.fn: -// CHECK2-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .check.next: -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 -// CHECK2-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK2-NEXT: [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16* -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// 
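The .mastercheck sequence checked just below encodes the master thread id as (nthreads - 1) & ~(warpsize - 1), i.e. the first lane of the last warp; the lower warps run the worker loop while the last warp hosts the master. A minimal C++ sketch of that arithmetic (function and variable names are illustrative, not part of the test):

// Sketch of the master-tid selection encoded by the sub/xor/and CHECK lines.
// `NThreads`/`WarpSize` stand in for the ntid.x and warpsize SREG reads.
inline unsigned MasterTid(unsigned NThreads, unsigned WarpSize) {
  return (NThreads - 1) & ~(WarpSize - 1); // first thread of the last warp
}
// Example: NThreads = 128, WarpSize = 32 -> MasterTid = 96.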
CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 -// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 -// CHECK2-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i64 0) -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK2-NEXT: store i32 [[ADD]], i32* [[CONV1]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV2]], align 8 -// CHECK2-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32 -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1 -// CHECK2-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16 -// CHECK2-NEXT: store i16 [[CONV11]], i16* [[CONV2]], align 8 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK2-NEXT: store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32 45, i32* [[A]], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 
[[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker -// CHECK2-SAME: () #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK2: .execute.fn: -// CHECK2-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .check.next: -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 -// CHECK2-SAME: (i64 [[A:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* -// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i64 1) -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 -// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK2-NEXT: store i32 [[INC]], i32* [[A7]], align 4 -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK2-SAME: (i32* noalias 
[[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] -// CHECK2: omp.critical.loop: -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] -// CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] -// CHECK2: omp.critical.test: -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] -// CHECK2-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] -// CHECK2: omp.critical.body: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 -// CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK2-NEXT: br label [[OMP_CRITICAL_SYNC]] -// CHECK2: omp.critical.sync: -// CHECK2-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) -// CHECK2-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 -// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP]] -// CHECK2: omp.critical.exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 
8 -// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1113,8 +127,6 @@ // CHECK3-NEXT: br label [[DOTAWAIT_WORK]] // CHECK3: .exit: // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 // CHECK3-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1165,8 +177,6 @@ // CHECK3-NEXT: br label [[DOTEXIT]] // CHECK3: .exit: // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1177,8 +187,6 @@ // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK3-NEXT: store i32 42, i32* [[A]], align 4 // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined___wrapper // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1192,8 +200,6 @@ // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1204,8 +210,6 @@ // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK3-NEXT: store i32 43, i32* [[A]], align 4 // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1219,8 +223,6 @@ // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1231,8 +233,6 @@ // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK3-NEXT: store i32 44, i32* [[A]], align 4 // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1246,8 +246,6 @@ // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker // CHECK3-SAME: () #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1288,8 +286,6 @@ // CHECK3-NEXT: br label [[DOTAWAIT_WORK]] // CHECK3: .exit: // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 // CHECK3-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1354,8 +350,6 @@ // CHECK3-NEXT: br label [[DOTEXIT]] // 
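The omp.critical.loop blocks checked above for __omp_outlined__4 serialize the critical region across the block: every thread walks a counter from 0 to ntid.x, only the thread whose tid matches the counter runs the guarded body, and __kmpc_syncwarp on the mask saved by __kmpc_warp_active_thread_mask keeps the warp in lockstep between turns. A hedged C++ sketch of that shape (SyncWarp is an assumed stand-in, not a real runtime symbol):

// Round-robin serialization of a critical body, mirroring the
// omp.critical.loop/.test/.body/.sync CHECK lines (illustrative only).
void SyncWarp(unsigned Mask); // stand-in for __kmpc_syncwarp

void CriticalRoundRobin(int &A, int Tid, int NumThreads, unsigned Mask) {
  // Each thread takes the body exactly once, in tid order.
  for (int Counter = 0; Counter < NumThreads; ++Counter) {
    if (Tid == Counter) {
      // In the checked IR this body runs between __kmpc_critical and
      // __kmpc_end_critical on "_gomp_critical_user_$var".
      ++A;
    }
    SyncWarp(Mask); // lockstep barrier on the saved active mask
  }
}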
CHECK3: .exit: // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1369,8 +363,6 @@ // CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 // CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1384,8 +376,6 @@ // CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK3-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker // CHECK3-SAME: () #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1426,8 +416,6 @@ // CHECK3-NEXT: br label [[DOTAWAIT_WORK]] // CHECK3: .exit: // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 // CHECK3-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1486,8 +474,6 @@ // CHECK3-NEXT: br label [[DOTEXIT]] // CHECK3: .exit: // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1528,8 +514,6 @@ // CHECK3-NEXT: br label [[OMP_CRITICAL_LOOP]] // CHECK3: omp.critical.exit: // CHECK3-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper // CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK3-NEXT: entry: @@ -1547,8 +531,6 @@ // CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 // CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR2]] // CHECK3-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker // CHECK4-SAME: () #[[ATTR0:[0-9]+]] { // CHECK4-NEXT: entry: @@ -1603,8 +585,6 @@ // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 // CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] { // CHECK4-NEXT: entry: @@ -1655,8 +635,6 @@ // CHECK4-NEXT: br label [[DOTEXIT]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: @@ -1667,8 +645,6 @@ // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK4-NEXT: store i32 42, i32* [[A]], align 4 // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined___wrapper // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -1682,21 +658,316 @@ // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* 
noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 44, i32* [[A]], align 4 +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker +// CHECK4-SAME: () #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK4: .await.work: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK4-NEXT: store i8 
[[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK4: .select.workers: +// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK4: .execute.parallel: +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) +// CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK4: .execute.fn: +// CHECK4-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK4: .check.next: +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK4-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK4: .terminate.parallel: +// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK4: .barrier.parallel: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 +// CHECK4-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK4-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK4-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK4: .worker: +// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .mastercheck: +// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: 
[[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK4-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK4-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK4: .master: +// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 +// CHECK4-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 +// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK4-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK4-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 +// CHECK4-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 +// CHECK4-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK4-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK4: .termination.notifier: +// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTEXIT]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 45, i32* [[A]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define 
{{[^@]+}}@__omp_outlined__3_wrapper +// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK4-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker +// CHECK4-SAME: () #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK4: .await.work: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK4: .select.workers: +// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK4: .execute.parallel: +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) +// CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK4: .execute.fn: +// CHECK4-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK4: .check.next: +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK4-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK4: .terminate.parallel: +// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK4: .barrier.parallel: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 +// CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK4: .worker: +// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .mastercheck: +// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK4: .master: +// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK4-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK4-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* +// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1) +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 +// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK4-NEXT: store i32 [[INC]], i32* [[A7]], align 4 +// CHECK4-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK4: .termination.notifier: +// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTEXIT]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK4-SAME: (i32* noalias 
[[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 // CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] +// CHECK4: omp.critical.loop: +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] +// CHECK4-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] +// CHECK4: omp.critical.test: +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] +// CHECK4-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] +// CHECK4: omp.critical.body: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK4-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK4-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK4-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK4-NEXT: br label [[OMP_CRITICAL_SYNC]] +// CHECK4: omp.critical.sync: +// CHECK4-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK4-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK4-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 +// CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP]] +// CHECK4: omp.critical.exit: // CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper // CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 @@ -1707,814 +978,1427 @@ // CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 // CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 // CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32 44, i32* [[A]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] // CHECK4-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker +// CHECK5-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK5: .await.work: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK5: .select.workers: +// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK5: .execute.parallel: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK5: .execute.fn: +// CHECK5-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK5: .check.next: +// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK5-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK5: .execute.fn2: +// CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK5: .check.next3: +// CHECK5-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) +// CHECK5-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] +// CHECK5: .execute.fn5: +// CHECK5-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 
[[TMP4]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK5: .check.next6: +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK5-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK5: .terminate.parallel: +// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK5: .barrier.parallel: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29 +// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 +// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK5: .worker: +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .mastercheck: +// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK5: .master: +// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) +// CHECK5-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 
[[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK5: .termination.notifier: +// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTEXIT]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 42, i32* [[A]], align 4 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// 
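Each __omp_outlined__*_wrapper checked in this file has one shape: recover the captured-variable array with __kmpc_get_shared_variables, then forward the pointers to the outlined body. A minimal C++ sketch for a wrapper with one captured variable (all names other than the runtime call are illustrative):

// Shape of the parallel wrappers checked throughout this file (sketch).
extern "C" void __kmpc_get_shared_variables(void ***GlobalArgs);
void Outlined(int *GTid, int *BTid, int *A); // hypothetical outlined body

void OutlinedWrapper(short /*Unused*/, int GTid) {
  int Zero = 0;
  void **SharedArgs;
  __kmpc_get_shared_variables(&SharedArgs);   // args published by the caller
  int *A = static_cast<int *>(SharedArgs[0]); // first captured variable
  Outlined(&GTid, &Zero, A);
}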
CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 44, i32* [[A]], align 4
+// CHECK5-NEXT: ret void
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
+// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker
+// CHECK5-SAME: () #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK5: .await.work:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK5: .select.workers:
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK5: .execute.parallel:
+// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
+// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK5: .execute.fn:
+// CHECK5-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK5: .check.next:
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK5: .terminate.parallel:
+// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK5: .barrier.parallel:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
+// CHECK5-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
+// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
+// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK5: .worker:
+// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .mastercheck:
+// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
+// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
+// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK5: .master:
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000
+// CHECK5-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32
+// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0)
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4
+// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32
+// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
+// CHECK5-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
+// CHECK5-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4
+// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK5-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
+// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK5: .termination.notifier:
+// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTEXIT]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32 45, i32* [[A]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// CHECK5-NEXT: ret void
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
+// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK5-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker
+// CHECK5-SAME: () #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK5: .await.work:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK5: .select.workers:
+// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK5: .execute.parallel:
+// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
+// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK5: .execute.fn:
+// CHECK5-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK5: .check.next:
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
+// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK5: .terminate.parallel:
+// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK5: .barrier.parallel:
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58
+// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
+// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK5: .worker:
+// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]]
+// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK5: .mastercheck:
+// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK5: .master:
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
+// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK5-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
+// CHECK5-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8*
+// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1)
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4
+// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK5-NEXT: store i32 [[INC]], i32* [[A7]], align 4
+// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
+// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK5: .termination.notifier:
+// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK5-NEXT: br label [[DOTEXIT]]
+// CHECK5: .exit:
+// CHECK5-NEXT: ret void
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK5-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK5-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]]
+// CHECK5: omp.critical.loop:
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
+// CHECK5-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
+// CHECK5: omp.critical.test:
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
+// CHECK5-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
+// CHECK5: omp.critical.body:
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK5-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4
+// CHECK5-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK5-NEXT: br label [[OMP_CRITICAL_SYNC]]
+// CHECK5: omp.critical.sync:
+// CHECK5-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK5-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK5-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
+// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP]]
+// CHECK5: omp.critical.exit:
+// CHECK5-NEXT: ret void
+// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
+// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
+// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
+// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
+// CHECK5-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26_worker
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
+// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK1: .execute.fn:
+// CHECK1-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .check.next:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
+// CHECK1-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
+// CHECK1: .execute.fn2:
+// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK1: .check.next3:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
+// CHECK1-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]]
+// CHECK1: .execute.fn5:
+// CHECK1-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK1: .check.next6:
+// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
 //
 //
-// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
-// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK4-NEXT: entry:
-// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
-// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
-// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
-// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
-// CHECK4-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
+// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 8
+// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK1: .worker:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26_worker() #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .mastercheck:
+// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK1: .master:
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i64 0)
+// CHECK1-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i64 0)
+// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i64 0)
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8
+// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK1: .termination.notifier:
+// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTEXIT]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
 //
 //
-// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker
-// CHECK4-SAME: () #[[ATTR0]] {
-// CHECK4-NEXT: entry:
-// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
-// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4
-// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK4: .await.work:
-// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK4: .select.workers:
-// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK4: .execute.parallel:
-// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
-// CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
-// CHECK4: .execute.fn:
-// CHECK4-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
-// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK4: .check.next:
-// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK4-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
-// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]]
-// CHECK4: .terminate.parallel:
-// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK4: .barrier.parallel:
-// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK4-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK4: .exit:
-// CHECK4-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 42, i32* [[A]], align 4
+// CHECK1-NEXT: ret void
 //
 //
-// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
-// CHECK4-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
-// CHECK4-NEXT: entry:
-// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
-// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
-// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK4-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
-// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
-// CHECK4-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
-// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
-// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
-// CHECK4-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
-// CHECK4-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
-// CHECK4: .worker:
-// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]]
-// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK4: .mastercheck:
-// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
-// CHECK4-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
-// CHECK4-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
-// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
-// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
-// CHECK4-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
-// CHECK4: .master:
-// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
-// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
-// CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000
-// CHECK4-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32
-// CHECK4-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0)
-// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4
-// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1
-// CHECK4-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
-// CHECK4-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4
-// CHECK4-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32
-// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
-// CHECK4-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
-// CHECK4-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4
-// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
-// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1
-// CHECK4-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
-// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
-// CHECK4: .termination.notifier:
-// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK4-NEXT: br label [[DOTEXIT]]
-// CHECK4: .exit:
-// CHECK4-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
+// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK1-NEXT: ret void
 //
 //
-// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3
-// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
-// CHECK4-NEXT: entry:
-// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK4-NEXT: [[A:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK4-NEXT: store i32 45, i32* [[A]], align 4
-// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
-// CHECK4-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 43, i32* [[A]], align 4
+// CHECK1-NEXT: ret void
 //
 //
-// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
-// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK4-NEXT: entry:
-// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
-// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
-// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
-// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-// CHECK4-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
-// CHECK4-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
+// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK1-NEXT: ret void
 //
 //
-// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker
-// CHECK4-SAME: () #[[ATTR0]] {
-// CHECK4-NEXT: entry:
-// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
-// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4
-// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK4: .await.work:
-// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK4: .select.workers:
-// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK4: .execute.parallel:
-// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK4-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
-// CHECK4-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
-// CHECK4: .execute.fn:
-// CHECK4-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
-// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK4: .check.next:
-// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK4-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
-// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL]]
-// CHECK4: .terminate.parallel:
-// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK4: .barrier.parallel:
-// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK4-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK4: .exit:
-// CHECK4-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 44, i32* [[A]], align 4
+// CHECK1-NEXT: ret void
 //
 //
-// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58
-// CHECK4-SAME: (i32 [[A:%.*]]) #[[ATTR1]] {
-// CHECK4-NEXT: entry:
-// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
-// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
-// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
-// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
-// CHECK4: .worker:
-// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]]
-// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK4: .mastercheck:
-// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
-// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
-// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
-// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
-// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
-// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
-// CHECK4: .master:
-// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
-// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
-// CHECK4-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
-// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
-// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4
-// CHECK4-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
-// CHECK4-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4
-// CHECK4-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
-// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8*
-// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
-// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1)
-// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4
-// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1
-// CHECK4-NEXT: store i32 [[INC]], i32* [[A7]], align 4
-// CHECK4-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
-// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
-// CHECK4: .termination.notifier:
-// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK4-NEXT: br label [[DOTEXIT]]
-// CHECK4: .exit:
-// CHECK4-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper
+// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_worker
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
+// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK1: .execute.fn:
+// CHECK1-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .check.next:
+// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK1-SAME: (i64 [[N:%.*]], i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK1: .worker:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_worker() #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .mastercheck:
+// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
+// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
+// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
+// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
+// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
+// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK1: .master:
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
+// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
+// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32
+// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i64 0)
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV1]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV2]], align 8
+// CHECK1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32
+// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1
+// CHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
+// CHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV2]], align 8
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK1-NEXT: store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK1: .termination.notifier:
+// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTEXIT]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
 //
 //
-// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4
-// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
-// CHECK4-NEXT: entry:
-// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
-// CHECK4-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
-// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
-// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
-// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK4-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
-// CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]]
-// CHECK4: omp.critical.loop:
-// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
-// CHECK4-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
-// CHECK4-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
-// CHECK4: omp.critical.test:
-// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
-// CHECK4-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
-// CHECK4-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
-// CHECK4: omp.critical.body:
-// CHECK4-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
-// CHECK4-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
-// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
-// CHECK4-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4
-// CHECK4-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
-// CHECK4-NEXT: br label [[OMP_CRITICAL_SYNC]]
-// CHECK4: omp.critical.sync:
-// CHECK4-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
-// CHECK4-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
-// CHECK4-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
-// CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP]]
-// CHECK4: omp.critical.exit:
-// CHECK4-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 45, i32* [[A]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// CHECK1-NEXT: ret void
 //
 //
-// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper
-// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK4-NEXT: entry:
-// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
-// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
-// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
-// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
-// CHECK4-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
-// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
-// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
-// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
-// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
-// CHECK4-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
+// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
+// CHECK1-NEXT: ret void
 //
 //
-// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker
-// CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
-// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK5: .await.work:
-// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK5: .select.workers:
-// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK5: .execute.parallel:
-// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
-// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
-// CHECK5: .execute.fn:
-// CHECK5-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
-// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK5: .check.next:
-// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK5-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
-// CHECK5-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
-// CHECK5: .execute.fn2:
-// CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
-// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
-// CHECK5: .check.next3:
-// CHECK5-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK5-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
-// CHECK5-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]]
-// CHECK5: .execute.fn5:
-// CHECK5-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
-// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
-// CHECK5: .check.next6:
-// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK5-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]])
-// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]]
-// CHECK5: .terminate.parallel:
-// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK5: .barrier.parallel:
-// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK5: .exit:
-// CHECK5-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55_worker
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*)
+// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
+// CHECK1: .execute.fn:
+// CHECK1-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .check.next:
+// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
 //
 //
-// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29
-// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
-// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4
-// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4
-// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
-// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
-// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
-// CHECK5: .worker:
-// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l29_worker() #[[ATTR3]]
-// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK5: .mastercheck:
-// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
-// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
-// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
-// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
-// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
-// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
-// CHECK5: .master:
-// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
-// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
-// CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK5-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0)
-// CHECK5-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
-// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0)
-// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8**
-// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0)
-// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4
-// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
-// CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
-// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
-// CHECK5: .termination.notifier:
-// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK5-NEXT: br label [[DOTEXIT]]
-// CHECK5: .exit:
-// CHECK5-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
+// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
+// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK1: .worker:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55_worker() #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .mastercheck:
+// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32
[[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[A7:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A7]] to i32* +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[A_ON_STACK]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[A_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP9]], i64 1) +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ON_STACK]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK1-NEXT: store i32 [[INC]], i32* [[A_ON_STACK]], align 4 +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[A7]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 42, i32* [[A]], align 4 -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: 
[[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] +// CHECK1: omp.critical.loop: +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] +// CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] +// CHECK1: omp.critical.test: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] +// CHECK1-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] +// CHECK1: omp.critical.body: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK1-NEXT: br label [[OMP_CRITICAL_SYNC]] +// CHECK1: omp.critical.sync: +// CHECK1-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK1-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 +// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP]] +// CHECK1: omp.critical.exit: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined___wrapper -// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 43, i32* [[A]], align 4 -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2:[0-9]+]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]] +// CHECK2: .execute.fn2: +// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .check.next3: +// CHECK2-NEXT: [[TMP7:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH4:%.*]] = icmp eq i8* [[TMP7]], bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH4]], label [[DOTEXECUTE_FN5:%.*]], label [[DOTCHECK_NEXT6:%.*]] +// CHECK2: 
.execute.fn5: +// CHECK2-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .check.next6: +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP8]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26 +// CHECK2-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [0 x i8*], align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS8:%.*]] = alloca [0 x i8*], align 4 +// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26_worker() #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP6]], i32 0) +// CHECK2-NEXT: [[TMP7:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 0, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP7]], i32 0) +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS8]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** [[TMP8]], i32 0) +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 44, i32* [[A]], align 4 -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 42, i32* [[A]], align 4 +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper -// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store 
i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 43, i32* [[A]], align 4 +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker -// CHECK5-SAME: () #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK5: .await.work: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK5: .select.workers: -// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK5: .execute.parallel: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) -// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK5: .execute.fn: -// CHECK5-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] 
-// CHECK5: .check.next: -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK5: .terminate.parallel: -// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK5: .barrier.parallel: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46 -// CHECK5-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 -// CHECK5-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 -// CHECK5-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK5-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK5: .worker: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_worker() #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .mastercheck: -// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK5-NEXT: 
[[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK5: .master: -// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 -// CHECK5-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 -// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK5-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 -// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 -// CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 -// CHECK5-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 -// CHECK5-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK5-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 -// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK5: .termination.notifier: -// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTEXIT]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 44, i32* [[A]], align 4 +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32 45, i32* [[A]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, 
i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_worker +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 
[[TMP4]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker -// CHECK5-SAME: () #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK5: .await.work: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK5: .select.workers: -// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK5: .execute.parallel: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) -// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK5: .execute.fn: -// CHECK5-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK5: .check.next: -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK5: .terminate.parallel: -// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK5: .barrier.parallel: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK2-SAME: (i32 [[N:%.*]], i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = 
alloca [0 x i8*], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_worker() #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 1000 +// CHECK2-NEXT: [[TMP8:%.*]] = zext i1 [[CMP]] to i32 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 [[TMP8]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP9]], i32 0) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i16, i16* [[CONV]], align 4 +// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1 +// CHECK2-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16 +// CHECK2-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: store 
i32 [[ADD10]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58 -// CHECK5-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK5: .worker: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l58_worker() #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .mastercheck: -// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK5: .master: -// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: [[A7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 -// CHECK5-NEXT: store i32 [[TMP7]], i32* [[A7]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A7]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* 
@__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP11]], i32 1) -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[A7]], align 4 -// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP12]], 1 -// CHECK5-NEXT: store i32 [[INC]], i32* [[A7]], align 4 -// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK5: .termination.notifier: -// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTEXIT]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 45, i32* [[A]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]]) +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] -// CHECK5: omp.critical.loop: -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] -// CHECK5-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] -// CHECK5: omp.critical.test: -// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] -// CHECK5-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] -// CHECK5: omp.critical.body: -// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK5-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") 
-// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 -// CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK5-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") -// CHECK5-NEXT: br label [[OMP_CRITICAL_SYNC]] -// CHECK5: omp.critical.sync: -// CHECK5-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) -// CHECK5-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 -// CHECK5-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 -// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP]] -// CHECK5: omp.critical.exit: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]] +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper -// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55_worker +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// 
CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined__4_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55 +// CHECK2-SAME: (i32 [[A:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55_worker() #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, 
i32* [[A_ADDR]], align 4 +// CHECK2-NEXT: [[A7:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A7]] to i32* +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[A_ON_STACK]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[A_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__4 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__4_wrapper to i8*), i8** [[TMP9]], i32 1) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A_ON_STACK]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK2-NEXT: store i32 [[INC]], i32* [[A_ON_STACK]], align 4 +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[A7]]) +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[CRITICAL_COUNTER:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] +// CHECK2: omp.critical.loop: +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]] +// CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]] +// CHECK2: omp.critical.test: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]] +// CHECK2-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]] +// CHECK2: omp.critical.body: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], 
align 4 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1 +// CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") +// CHECK2-NEXT: br label [[OMP_CRITICAL_SYNC]] +// CHECK2: omp.critical.sync: +// CHECK2-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK2-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 +// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP]] +// CHECK2: omp.critical.exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR2]] +// CHECK2-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp @@ -1,8 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ // Test target codegen - host bc file has to be created first. 
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -73,8 +72,6 @@ // CHECK1-NEXT: br label [[DOTAWAIT_WORK]] // CHECK1: .exit: // CHECK1-NEXT: ret void -// -// // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14 // CHECK1-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] { // CHECK1-NEXT: entry: @@ -141,8 +138,6 @@ // CHECK1-NEXT: br label [[DOTEXIT]] // CHECK1: .exit: // CHECK1-NEXT: ret void -// -// // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: @@ -231,8 +226,6 @@ // CHECK1: omp.dispatch.end: // CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) // CHECK1-NEXT: ret void -// -// // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK1-NEXT: entry: @@ -253,8 +246,6 @@ // CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 // CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] // CHECK1-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14_worker // CHECK2-SAME: () #[[ATTR0:[0-9]+]] { // CHECK2-NEXT: entry: @@ -295,8 +286,6 @@ // CHECK2-NEXT: br label [[DOTAWAIT_WORK]] // CHECK2: .exit: // CHECK2-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14 // CHECK2-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] { // CHECK2-NEXT: entry: @@ -358,8 +347,6 @@ // CHECK2-NEXT: br label [[DOTEXIT]] // CHECK2: .exit: // CHECK2-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: @@ -448,8 +435,6 @@ // CHECK2: omp.dispatch.end: // CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) // CHECK2-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK2-NEXT: 
entry: @@ -470,4 +455,217 @@ // CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 // CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] // CHECK2-NEXT: ret void +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13_worker +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK: .await.work: +// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK: .select.workers: +// CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK: .execute.parallel: +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) +// CHECK-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK: .execute.fn: +// CHECK-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK: .check.next: +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK: .terminate.parallel: +// CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK: .barrier.parallel: +// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK: .exit: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13 +// CHECK-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK: .worker: +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13_worker() #[[ATTR3]] +// CHECK-NEXT: br label [[DOTEXIT:%.*]] +// CHECK: .mastercheck: +// CHECK-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK: .master: +// CHECK-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK-NEXT: [[D:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D]] to i32* +// CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK-NEXT: store i32 [[TMP7]], i32* [[D_ON_STACK]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[D_ON_STACK]] to i8* +// CHECK-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x i32]*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP12]], i64 2) +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 3 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 +// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[D]]) +// CHECK-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK: .termination.notifier: +// CHECK-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK-NEXT: br label [[DOTEXIT]] +// CHECK: .exit: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 
dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[D_ADDR]], align 8 +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK: omp.dispatch.cond: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: cond.true: +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: +// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label [[COND_END]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CHECK-NEXT: br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK: omp.dispatch.body: +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]] +// CHECK-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], i32* 
[[I]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP12]] +// CHECK-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: +// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK: omp.dispatch.inc: +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_LB]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK: omp.dispatch.end: +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to [10 x i32]** +// CHECK-NEXT: [[TMP5:%.*]] = load [10 x i32]*, [10 x i32]** [[TMP4]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** +// CHECK-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 +// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]] +// CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_codegen.cpp @@ -155,7 +155,6 @@ // CHECK1-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 8 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// 
CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK1: .execute: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -257,7 +256,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK1: .termination.notifier: // CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) @@ -332,7 +330,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 8 // CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1 @@ -441,7 +438,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 // CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 @@ -577,7 +573,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 @@ -681,7 +676,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8 // CHECK1-NEXT: [[CONV9:%.*]] = sitofp i32 [[TMP9]] to double // CHECK1-NEXT: [[ADD:%.*]] = fadd double [[CONV9]], 1.500000e+00 @@ -711,56 +705,27 @@ // // // CHECK1-LABEL: define {{[^@]+}}@_Z3baziRd -// CHECK1-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { // CHECK1-NEXT: entry: -// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[F2:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK1-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) -// 
CHECK1-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0 -// CHECK1-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]] -// CHECK1: .spmd: -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .non-spmd: -// CHECK1-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i64 4, i64 128 -// CHECK1-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 [[TMP5]], i16 0) -// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ] -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0* -// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]] -// CHECK1-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]] -// CHECK1-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]] -// CHECK1-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4 +// CHECK1-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32* +// CHECK1-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4 // CHECK1-NEXT: store double* [[A]], double** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8* -// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8* -// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i64 2) -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4 -// CHECK1-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4 -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]] -// CHECK1: .non-spmd4: -// CHECK1-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8* -// CHECK1-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]]) -// CHECK1-NEXT: br label [[DOTEXIT5]] -// CHECK1: .exit5: -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4 -// CHECK1-NEXT: ret i32 [[TMP20]] +// CHECK1-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 8 +// CHECK1-NEXT: 
[[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8* +// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i64 2) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4 +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[F]]) +// CHECK1-NEXT: ret i32 [[TMP7]] // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker @@ -825,7 +790,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]] // CHECK1-NEXT: unreachable // CHECK1: 5: @@ -909,7 +873,6 @@ // CHECK1-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK1-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 @@ -987,7 +950,6 @@ // CHECK2-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -1089,7 +1051,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK2: .termination.notifier: // CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) @@ -1164,7 +1125,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4 // CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32 // CHECK2-NEXT: [[ADD:%.*]] = add 
nsw i32 [[CONV7]], 1 @@ -1272,7 +1232,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 // CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 @@ -1407,7 +1366,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 @@ -1510,7 +1468,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP9]] to double // CHECK2-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 @@ -1540,56 +1497,27 @@ // // // CHECK2-LABEL: define {{[^@]+}}@_Z3baziRd -// CHECK2-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { +// CHECK2-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { // CHECK2-NEXT: entry: -// CHECK2-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[F2:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK2-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) -// CHECK2-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0 -// CHECK2-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]] -// CHECK2: .spmd: -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .non-spmd: -// CHECK2-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i32 4, i32 128 -// CHECK2-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i32 [[TMP5]], i16 0) -// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0* -// CHECK2-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and 
i32 [[NVPTX_TID]], 31 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]] -// CHECK2-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]] -// CHECK2-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]] -// CHECK2-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4 +// CHECK2-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32* +// CHECK2-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4 // CHECK2-NEXT: store double* [[A]], double** [[A_ADDR]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8* -// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8* -// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i32 2) -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4 -// CHECK2-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4 -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]] -// CHECK2: .non-spmd4: -// CHECK2-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8* -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]]) -// CHECK2-NEXT: br label [[DOTEXIT5]] -// CHECK2: .exit5: -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4 -// CHECK2-NEXT: ret i32 [[TMP20]] +// CHECK2-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8* +// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i32 2) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4 +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[F]]) +// CHECK2-NEXT: ret i32 [[TMP7]] // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker @@ -1654,7 +1582,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]] // CHECK2-NEXT: unreachable // CHECK2: 5: @@ -1737,7 +1664,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 @@ -1815,7 +1741,6 @@ // CHECK3-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4 // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) @@ -1917,7 +1842,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK3: .termination.notifier: // CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) @@ -1992,7 +1916,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4 // CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1 @@ -2100,7 +2023,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 // CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 @@ -2235,7 +2157,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 @@ -2338,7 +2259,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP9]] to double // CHECK3-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 @@ -2368,56 +2288,27 @@ // // // CHECK3-LABEL: define {{[^@]+}}@_Z3baziRd -// CHECK3-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] { // CHECK3-NEXT: entry: -// CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[F2:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK3-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) -// CHECK3-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0 -// CHECK3-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]] -// CHECK3-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]] -// CHECK3: .spmd: -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .non-spmd: -// CHECK3-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i32 4, i32 128 -// CHECK3-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i32 [[TMP5]], i16 0) -// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: br label [[DOTEXIT]] -// CHECK3: .exit: -// CHECK3-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]] -// CHECK3-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]] -// CHECK3-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]] -// CHECK3-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4 +// CHECK3-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32* +// CHECK3-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4 // CHECK3-NEXT: store double* [[A]], double** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8* -// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], 
[2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8* -// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i32 2) -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4 -// CHECK3-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4 -// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]] -// CHECK3: .non-spmd4: -// CHECK3-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8* -// CHECK3-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]]) -// CHECK3-NEXT: br label [[DOTEXIT5]] -// CHECK3: .exit5: -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4 -// CHECK3-NEXT: ret i32 [[TMP20]] +// CHECK3-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8* +// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i32 2) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4 +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[F]]) +// CHECK3-NEXT: ret i32 [[TMP7]] // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker @@ -2482,7 +2373,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]] // CHECK3-NEXT: unreachable // CHECK3: 5: @@ -2565,7 +2455,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp @@ -60,7 +60,6 @@ // CHECK1-NEXT: 
[[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -110,7 +109,6 @@
// CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@@ -173,7 +171,6 @@
// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -223,7 +220,6 @@
// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@@ -286,7 +282,6 @@
// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -336,7 +331,6 @@
// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@@ -399,7 +393,6 @@
// CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -449,7 +442,6 @@
// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@@ -512,7 +504,6 @@
// CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK5: .execute:
// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -562,7 +553,6 @@
// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK5: .execute:
// CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@@ -625,7 +615,6 @@
// CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK6: .execute:
// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -675,7 +664,6 @@
// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK6: .execute:
// CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
@@ -55,7 +55,6 @@
// CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -108,7 +107,6 @@
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32*
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@@ -172,7 +170,6 @@
// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -224,7 +221,6 @@
// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@@ -288,7 +284,6 @@
// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -340,7 +335,6 @@
// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@@ -404,7 +398,6 @@
// CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -457,7 +450,6 @@
// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32*
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@@ -521,7 +513,6 @@
// CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK5: .execute:
// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -573,7 +564,6 @@
// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK5: .execute:
// CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@@ -637,7 +627,6 @@
// CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK6: .execute:
// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -689,7 +678,6 @@
// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK6: .execute:
// CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
diff --git a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
@@ -55,7 +55,6 @@
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXEC:.+]]
//
// CHECK: [[EXEC]]
@@ -73,7 +72,6 @@
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXEC:.+]]
//
// CHECK: [[EXEC]]
@@ -91,7 +89,6 @@
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXEC:.+]]
//
// CHECK: [[EXEC]]
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -55,7 +55,6 @@
// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]
@@ -239,7 +238,6 @@
// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]
@@ -501,7 +499,6 @@
// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
@@ -105,7 +105,6 @@
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA6:![0-9]+]]
// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR5]]
@@ -135,115 +134,113 @@
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10:![0-9]+]]
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2, !tbaa [[TBAA12:![0-9]+]]
-// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8, !tbaa [[TBAA14:![0-9]+]]
-// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
-// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
-// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 0
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
-// CHECK1-NEXT: [[ISTART:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
-// CHECK1-NEXT: [[IEND:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 1
-// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 2
-// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP6]]) #[[ATTR5]]
+// CHECK1-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32*
+// CHECK1-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32*
+// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 8)
+// CHECK1-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex"*
+// CHECK1-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP1:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP1]]) #[[ATTR5]]
// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP7]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP2:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP2]]) #[[ATTR5]]
// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP8]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP3]]) #[[ATTR5]]
// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP9]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP4]]) #[[ATTR5]]
// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i32* [[IB]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP10]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP12]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP13]], 99
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i32* [[IB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP7]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP8]], 99
// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
// CHECK1: omp.inner.for.cond.cleanup:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], 1
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
// CHECK1-NEXT: store i32 [[ADD]], i32* [[IB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP19:%.*]] = bitcast float* [[REF_TMP]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP19]]) #[[ATTR5]]
-// CHECK1-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA16:![0-9]+]]
-// CHECK1-NEXT: [[TMP20:%.*]] = bitcast float* [[REF_TMP2]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP20]]) #[[ATTR5]]
-// CHECK1-NEXT: store float 0.000000e+00, float* [[REF_TMP2]], align 4, !tbaa [[TBAA16]]
-// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR7:[0-9]+]]
-// CHECK1-NEXT: [[TMP21:%.*]] = bitcast float* [[REF_TMP2]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP21]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP22:%.*]] = bitcast float* [[REF_TMP]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP22]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP23]], 4
-// CHECK1-NEXT: store i32 [[MUL3]], i32* [[ISTART]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast float* [[REF_TMP]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP14]]) #[[ATTR5]]
+// CHECK1-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA12:![0-9]+]]
+// CHECK1-NEXT: [[TMP15:%.*]] = bitcast float* [[REF_TMP2]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP15]]) #[[ATTR5]]
+// CHECK1-NEXT: store float 0.000000e+00, float* [[REF_TMP2]], align 4, !tbaa [[TBAA12]]
+// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM_ON_STACK]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR7:[0-9]+]]
+// CHECK1-NEXT: [[TMP16:%.*]] = bitcast float* [[REF_TMP2]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP16]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP17:%.*]] = bitcast float* [[REF_TMP]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP17]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP18]], 4
+// CHECK1-NEXT: store i32 [[MUL3]], i32* [[ISTART_ON_STACK]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP19]], 1
// CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4
-// CHECK1-NEXT: store i32 [[MUL5]], i32* [[IEND]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i32* [[ISTART]] to i8*
-// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP28:%.*]] = bitcast i32* [[IEND]] to i8*
-// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex"* [[PARTIAL_SUM]] to i8*
-// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP31:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex"*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP31]], i64 3)
+// CHECK1-NEXT: store i32 [[MUL5]], i32* [[IEND_ON_STACK]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[ISTART_ON_STACK]] to i8*
+// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i32* [[IEND_ON_STACK]] to i8*
+// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP25:%.*]] = bitcast %"class.std::complex"* [[PARTIAL_SUM_ON_STACK]] to i8*
+// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP26:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex"*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP26]], i64 3)
// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK1: omp.body.continue:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], 1
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP27]], 1
// CHECK1-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK1: omp.inner.for.end:
// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP12]])
-// CHECK1-NEXT: [[TMP33:%.*]] = bitcast i32* [[IB]] to i8*
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]])
+// CHECK1-NEXT: [[TMP28:%.*]] = bitcast i32* [[IB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP28]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP29:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP29]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP30]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP31:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP31]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP32:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP32]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP33:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP33]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP34]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP35:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP35]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP36:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP36]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP37:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP37]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP38:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP38]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP39:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2, !tbaa [[TBAA12]]
-// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP39]])
+// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[PARTIAL_SUM]])
+// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[IEND]])
+// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[ISTART]])
// CHECK1-NEXT: ret void
//
//
@@ -345,10 +342,10 @@
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = bitcast float* [[REF_TMP]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP22]]) #[[ATTR5]]
-// CHECK1-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA12]]
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast float* [[REF_TMP6]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP23]]) #[[ATTR5]]
-// CHECK1-NEXT: store float 0.000000e+00, float* [[REF_TMP6]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT: store float 0.000000e+00, float* [[REF_TMP6]], align 4, !tbaa [[TBAA12]]
// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR7]]
// CHECK1-NEXT: [[TMP24:%.*]] = bitcast float* [[REF_TMP6]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP24]]) #[[ATTR5]]
@@ -405,12 +402,12 @@
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP41]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP42]] to float
-// CHECK1-NEXT: store float [[CONV]], float* [[REF_TMP15]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT: store float [[CONV]], float* [[REF_TMP15]], align 4, !tbaa [[TBAA12]]
// CHECK1-NEXT: [[TMP43:%.*]] = bitcast float* [[REF_TMP16]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP43]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to float
-// CHECK1-NEXT: store float [[CONV17]], float* [[REF_TMP16]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT: store float [[CONV17]], float* [[REF_TMP16]], align 4, !tbaa [[TBAA12]]
// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]], float* nonnull align 4 dereferenceable(4) [[REF_TMP15]], float* nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR7]]
// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR7]]
// CHECK1-NEXT: [[TMP45:%.*]] = bitcast float* [[REF_TMP16]] to i8*
@@ -493,15 +490,15 @@
// CHECK1-NEXT: [[TMP0:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK1-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR7]]
// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP1:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA18:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA14:![0-9]+]]
// CHECK1-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[CALL]]
-// CHECK1-NEXT: store float [[ADD]], float* [[__RE_]], align 4, !tbaa [[TBAA18]]
+// CHECK1-NEXT: store float [[ADD]], float* [[__RE_]], align 4, !tbaa [[TBAA14]]
// CHECK1-NEXT: [[TMP2:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK1-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR7]]
// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP3:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA20:![0-9]+]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA16:![0-9]+]]
// CHECK1-NEXT: [[ADD3:%.*]] = fadd float [[TMP3]], [[CALL2]]
-// CHECK1-NEXT: store float [[ADD3]], float* [[__IM_]], align 4, !tbaa [[TBAA20]]
+// CHECK1-NEXT: store float [[ADD3]], float* [[__IM_]], align 4, !tbaa [[TBAA16]]
// CHECK1-NEXT: ret %"class.std::complex"* [[THIS1]]
//
//
@@ -515,14 +512,14 @@
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex", align 4
// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]]
-// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]]
-// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]]
+// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17:![0-9]+]]
+// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]]
+// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]]
// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]]
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
-// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]]
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
@@ -574,7 +571,7 @@
// CHECK1-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to %"class.std::complex"*
// CHECK1-NEXT: [[TMP46:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8*
// CHECK1-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP44]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP46]], i8* align 4 [[TMP47]], i64 8, i1 false), !tbaa.struct !21
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP46]], i8* align 4 [[TMP47]], i64 8, i1 false), !tbaa.struct !19
// CHECK1-NEXT: br label [[IFCONT6:%.*]]
// CHECK1: else5:
// CHECK1-NEXT: br label [[IFCONT6]]
@@ -651,7 +648,7 @@
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA12]]
+// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA17]]
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
@@ -691,10 +688,10 @@
// CHECK1: .execute.parallel:
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
-// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*)
+// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
// CHECK1: .execute.fn:
-// CHECK1-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5]]
+// CHECK1-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5]]
// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK1: .check.next:
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
@@ -740,7 +737,6 @@
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR5]]
@@ -770,142 +766,140 @@
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared3", align 2, !tbaa [[TBAA12]]
-// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* @"_openmp_static_kernel$size4", align 8, !tbaa [[TBAA14]]
-// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
-// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
-// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 0
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
-// CHECK1-NEXT: [[ISTART:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
-// CHECK1-NEXT: [[IEND:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 2
-// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP6]]) #[[ATTR5]]
+// CHECK1-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32*
+// CHECK1-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32*
+// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 16)
+// CHECK1-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex.0"*
+// CHECK1-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP1:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP1]]) #[[ATTR5]]
// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP7]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP2:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP2]]) #[[ATTR5]]
// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP8]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP3]]) #[[ATTR5]]
// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP9]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP4]]) #[[ATTR5]]
// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i32* [[IB]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP10]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP12]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP13]], 99
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i32* [[IB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP8]], 99
// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
// CHECK1: omp.inner.for.cond.cleanup:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], 1
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
// CHECK1-NEXT: store i32 [[ADD]], i32* [[IB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP19:%.*]] = bitcast double* [[REF_TMP]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP19]]) #[[ATTR5]]
-// CHECK1-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA22:![0-9]+]]
-// CHECK1-NEXT: [[TMP20:%.*]] = bitcast double* [[REF_TMP2]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP20]]) #[[ATTR5]]
-// CHECK1-NEXT: store double 0.000000e+00, double* [[REF_TMP2]], align 8, !tbaa [[TBAA22]]
-// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR7]]
-// CHECK1-NEXT: [[TMP21:%.*]] = bitcast double* [[REF_TMP2]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP21]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP22:%.*]] = bitcast double* [[REF_TMP]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP22]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP23]], 4
-// CHECK1-NEXT: store i32 [[MUL3]], i32* [[ISTART]], align 8, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast double* [[REF_TMP]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP14]]) #[[ATTR5]]
+// CHECK1-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA20:![0-9]+]]
+// CHECK1-NEXT: [[TMP15:%.*]] = bitcast double* [[REF_TMP2]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP15]]) #[[ATTR5]]
+// CHECK1-NEXT: store double 0.000000e+00, double* [[REF_TMP2]], align 8, !tbaa [[TBAA20]]
+// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM_ON_STACK]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR7]]
+// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[REF_TMP2]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP16]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP17:%.*]] = bitcast double* [[REF_TMP]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP17]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP18]], 4
+// CHECK1-NEXT: store i32 [[MUL3]], i32* [[ISTART_ON_STACK]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP19]], 1
// CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4
-// CHECK1-NEXT: store i32 [[MUL5]], i32* [[IEND]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i32* [[ISTART]] to i8*
-// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP28:%.*]] = bitcast i32* [[IEND]] to i8*
-// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM]] to i8*
-// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP31:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex.1"*)* @__omp_outlined__5 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** [[TMP31]], i64 3)
+// CHECK1-NEXT: store i32 [[MUL5]], i32* [[IEND_ON_STACK]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[ISTART_ON_STACK]] to i8*
+// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i32* [[IEND_ON_STACK]] to i8*
+// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP25:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM_ON_STACK]] to i8*
+// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP26:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex.0"*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP26]], i64 3)
// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK1: omp.body.continue:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], 1
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP27]], 1
// CHECK1-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK1: omp.inner.for.end:
// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP12]])
-// CHECK1-NEXT: [[TMP33:%.*]] = bitcast i32* [[IB]] to i8*
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]])
+// CHECK1-NEXT: [[TMP28:%.*]] = bitcast i32* [[IB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP28]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP29:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP29]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP30]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP31:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP31]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP32:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8*
+// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP32]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP33:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP33]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP34]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP35:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP35]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP36:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP36]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP37:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP37]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP38:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
-// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP38]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP39:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared3", align 2, !tbaa [[TBAA12]]
-// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP39]])
+// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[PARTIAL_SUM]])
+// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[IEND]])
+// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[ISTART]])
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_ZNSt7complexIdEC1ERKdS2_
-// CHECK1-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 {
+// CHECK1-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 {
// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8
// CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca double*, align 8
// CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca double*, align 8
-// CHECK1-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK1-NEXT: store double* [[__RE]], double** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK1-NEXT: store double* [[__IM]], double** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[__RE_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load double*, double** [[__IM_ADDR]], align 8
-// CHECK1-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR7]]
+// CHECK1-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR7]]
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ISTART:%.*]], i32* nonnull align 4 dereferenceable(4) [[IEND:%.*]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM:%.*]]) #[[ATTR0]] {
+-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5
+-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ISTART:%.*]], i32* nonnull align 4 dereferenceable(4) [[IEND:%.*]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM:%.*]]) #[[ATTR0]] {
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ISTART:%.*]], i32* nonnull align 4 dereferenceable(4) [[IEND:%.*]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[ISTART_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[IEND_ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[PARTIAL_SUM_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
+-// CHECK1-NEXT: [[PARTIAL_SUM_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
+// CHECK1-NEXT: [[PARTIAL_SUM_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8
// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
@@ -916,11 +910,11 @@
// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[PARTIAL_SUM5:%.*]] = alloca %"class.std::complex.1", align 8
+// CHECK1-NEXT: [[PARTIAL_SUM5:%.*]] = alloca %"class.std::complex.0", align 8
// CHECK1-NEXT: [[REF_TMP:%.*]] = alloca double, align 8
// CHECK1-NEXT: [[REF_TMP6:%.*]] = alloca double, align 8
// CHECK1-NEXT: [[I7:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[REF_TMP14:%.*]] = alloca %"class.std::complex.1", align 8
+// CHECK1-NEXT: [[REF_TMP14:%.*]] = alloca %"class.std::complex.0", align 8
// CHECK1-NEXT: [[REF_TMP15:%.*]] = alloca double, align 8
// CHECK1-NEXT: [[REF_TMP16:%.*]] = alloca double, align 8
// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
@@ -928,10 +922,10 @@
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK1-NEXT: store i32* [[ISTART]], i32** [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK1-NEXT: store i32* [[IEND]], i32** [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store %"class.std::complex.1"* [[PARTIAL_SUM]], %"class.std::complex.1"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store %"class.std::complex.0"* [[PARTIAL_SUM]], %"class.std::complex.0"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[TMP2:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP3]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR_]] to i8*
@@ -976,15 +970,15 @@
// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP20]]) #[[ATTR5]]
// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP21:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM5]] to i8*
+// CHECK1-NEXT: [[TMP21:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM5]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 16, i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = bitcast double* [[REF_TMP]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP22]]) #[[ATTR5]]
-// CHECK1-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA22]]
+// CHECK1-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA20]]
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast double* [[REF_TMP6]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP23]]) #[[ATTR5]]
-// CHECK1-NEXT: store double 0.000000e+00, double* [[REF_TMP6]], align 8, !tbaa [[TBAA22]]
-// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR7]]
+// CHECK1-NEXT: store double 0.000000e+00, double* [[REF_TMP6]], align 8, !tbaa [[TBAA20]]
+// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR7]]
// CHECK1-NEXT: [[TMP24:%.*]] = bitcast double* [[REF_TMP6]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP24]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP25:%.*]] = bitcast double* [[REF_TMP]] to i8*
@@ -1034,25 +1028,25 @@
// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP39]], 1
// CHECK1-NEXT: [[ADD13:%.*]] = add i32 [[TMP38]], [[MUL]]
// CHECK1-NEXT: store i32 [[ADD13]], i32* [[I7]], align 4, !tbaa [[TBAA6]]
-// CHECK1-NEXT: [[TMP40:%.*]] = bitcast %"class.std::complex.1"* [[REF_TMP14]] to i8*
+// CHECK1-NEXT: [[TMP40:%.*]] = bitcast %"class.std::complex.0"* [[REF_TMP14]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 16, i8* [[TMP40]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP41:%.*]] = bitcast double* [[REF_TMP15]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP41]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP42]] to double
-// CHECK1-NEXT: store double [[CONV]], double* [[REF_TMP15]], align 8, !tbaa [[TBAA22]]
+// CHECK1-NEXT: store double [[CONV]], double* [[REF_TMP15]], align 8, !tbaa [[TBAA20]]
// CHECK1-NEXT: [[TMP43:%.*]] = bitcast double* [[REF_TMP16]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP43]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to double
-// CHECK1-NEXT: store double [[CONV17]], double* [[REF_TMP16]], align 8, !tbaa [[TBAA22]]
-// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR7]]
-// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.1"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR7]]
+// CHECK1-NEXT: store double [[CONV17]], double* [[REF_TMP16]], align 8, !tbaa [[TBAA20]]
+// CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR7]]
+// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR7]]
// CHECK1-NEXT: [[TMP45:%.*]] = bitcast double* [[REF_TMP16]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP45]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP46:%.*]] = bitcast double* [[REF_TMP15]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP46]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex.1"* [[REF_TMP14]] to i8*
+// CHECK1-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex.0"* [[REF_TMP14]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 16, i8* [[TMP47]]) #[[ATTR5]]
// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK1: omp.body.continue:
@@ -1081,20 +1075,20 @@
// CHECK1-NEXT: [[TMP55:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 4, !tbaa [[TBAA6]]
// CHECK1-NEXT: [[TMP57:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP58:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM5]] to i8*
+// CHECK1-NEXT: [[TMP58:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM5]] to i8*
// CHECK1-NEXT: store i8* [[TMP58]], i8** [[TMP57]], align 8
// CHECK1-NEXT: [[TMP59:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK1-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP56]], i32 1, i64 8, i8* [[TMP59]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func7, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func8)
+// CHECK1-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP56]], i32 1, i64 8, i8* [[TMP59]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6)
// CHECK1-NEXT: [[TMP61:%.*]] = icmp eq i32 [[TMP60]], 1
// CHECK1-NEXT: br i1 [[TMP61]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
-// CHECK1-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.1"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR7]]
+// CHECK1-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR7]]
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP56]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK1: .omp.reduction.done:
// CHECK1-NEXT: [[TMP62:%.*]] = bitcast i32* [[I7]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP62]]) #[[ATTR5]]
-// CHECK1-NEXT: [[TMP63:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM5]] to i8*
+// CHECK1-NEXT: [[TMP63:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM5]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 16, i8* [[TMP63]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
// CHECK1-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP64]]) #[[ATTR5]]
@@ -1118,29 +1112,29 @@
//
//
// CHECK1-LABEL: define {{[^@]+}}@_ZNSt7complexIdEpLIdEERS0_RKS_IT_E
-// CHECK1-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[__C:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK1-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[__C:%.*]]) #[[ATTR4]] comdat align 2 {
// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
-// CHECK1-NEXT: [[__C_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
-// CHECK1-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: store %"class.std::complex.1"* [[__C]], %"class.std::complex.1"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR7]]
-// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA24:![0-9]+]]
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8
+// CHECK1-NEXT: [[__C_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8
+// CHECK1-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: store %"class.std::complex.0"* [[__C]], %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR7]]
+// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA22:![0-9]+]]
// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[CALL]]
-// CHECK1-NEXT: store double [[ADD]], double* [[__RE_]], align 8, !tbaa [[TBAA24]]
-// CHECK1-NEXT: [[TMP2:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR7]]
-// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP3:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA26:![0-9]+]]
+// CHECK1-NEXT: store double [[ADD]], double* [[__RE_]], align 8, !tbaa [[TBAA22]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR7]]
+// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP3:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA24:![0-9]+]]
// CHECK1-NEXT: [[ADD3:%.*]] = fadd double [[TMP3]], [[CALL2]]
-// CHECK1-NEXT: store double [[ADD3]], double* [[__IM_]], align 8, !tbaa [[TBAA26]]
-// CHECK1-NEXT: ret %"class.std::complex.1"* [[THIS1]]
+// CHECK1-NEXT: store double [[ADD3]], double* [[__IM_]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT: ret %"class.std::complex.0"* [[THIS1]]
//
//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5
// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
@@ -1148,24 +1142,24 @@
// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex.1", align 8
+// CHECK1-NEXT:
[[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex.0", align 8 // CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]] -// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]] -// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]] +// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17]] +// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]] +// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]] // CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]] // CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* -// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]] -// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]] // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 // CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex.1"* -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr %"class.std::complex.1", %"class.std::complex.1"* [[TMP12]], i64 1 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast %"class.std::complex.1"* [[TMP13]] to i8* -// CHECK1-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex.1"* [[TMP12]] to i64* -// CHECK1-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex.1"* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex.0"* +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr %"class.std::complex.0", %"class.std::complex.0"* [[TMP12]], i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast %"class.std::complex.0"* [[TMP13]] to i8* +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex.0"* [[TMP12]] to i64* +// CHECK1-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i64* // CHECK1-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]] // CHECK1: .shuffle.pre_cond: // CHECK1-NEXT: [[TMP17:%.*]] = phi i64* [ [[TMP15]], [[ENTRY:%.*]] ], [ [[TMP28:%.*]], [[DOTSHUFFLE_THEN:%.*]] ] @@ -1187,7 +1181,7 @@ // CHECK1-NEXT: [[TMP29]] = getelementptr i64, i64* [[TMP18]], i64 1 // CHECK1-NEXT: br label [[DOTSHUFFLE_PRE_COND]] // CHECK1: .shuffle.exit: -// CHECK1-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.1"* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK1-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8* // CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP11]], align 8, !tbaa [[TBAA10]] // CHECK1-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP8]], 0 // CHECK1-NEXT: [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 1 @@ -1205,7 +1199,7 @@ // CHECK1: then: // CHECK1-NEXT: [[TMP43:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* // CHECK1-NEXT: [[TMP44:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK1-NEXT: call void 
@"_omp$reduction$reduction_func6"(i8* [[TMP43]], i8* [[TMP44]]) #[[ATTR5]] +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP43]], i8* [[TMP44]]) #[[ATTR5]] // CHECK1-NEXT: br label [[IFCONT:%.*]] // CHECK1: else: // CHECK1-NEXT: br label [[IFCONT]] @@ -1219,11 +1213,11 @@ // CHECK1-NEXT: [[TMP49:%.*]] = load i8*, i8** [[TMP48]], align 8 // CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 // CHECK1-NEXT: [[TMP51:%.*]] = load i8*, i8** [[TMP50]], align 8 -// CHECK1-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP49]] to %"class.std::complex.1"* -// CHECK1-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP51]] to %"class.std::complex.1"* -// CHECK1-NEXT: [[TMP54:%.*]] = bitcast %"class.std::complex.1"* [[TMP53]] to i8* -// CHECK1-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.1"* [[TMP52]] to i8* -// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP54]], i8* align 8 [[TMP55]], i64 16, i1 false), !tbaa.struct !27 +// CHECK1-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP49]] to %"class.std::complex.0"* +// CHECK1-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP51]] to %"class.std::complex.0"* +// CHECK1-NEXT: [[TMP54:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8* +// CHECK1-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP52]] to i8* +// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP54]], i8* align 8 [[TMP55]], i64 16, i1 false), !tbaa.struct !25 // CHECK1-NEXT: br label [[IFCONT6:%.*]] // CHECK1: else5: // CHECK1-NEXT: br label [[IFCONT6]] @@ -1231,7 +1225,7 @@ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8 +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6 // CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 @@ -1292,7 +1286,7 @@ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper // CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 @@ -1300,7 +1294,7 @@ // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA12]] +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA17]] // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA6]] // CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 @@ -1311,9 +1305,9 @@ // CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** // CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8, !tbaa [[TBAA10]] // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 2 -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex.1"** -// CHECK1-NEXT: [[TMP11:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[TMP10]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]], %"class.std::complex.1"* [[TMP11]]) #[[ATTR5]] +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex.0"** +// CHECK1-NEXT: 
[[TMP11:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP10]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]], %"class.std::complex.0"* [[TMP11]]) #[[ATTR5]] // CHECK1-NEXT: ret void // // @@ -1329,12 +1323,12 @@ // CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0 // CHECK1-NEXT: [[TMP0:%.*]] = load float*, float** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP1:%.*]] = load float, float* [[TMP0]], align 4, !tbaa [[TBAA16]] -// CHECK1-NEXT: store float [[TMP1]], float* [[__RE_]], align 4, !tbaa [[TBAA18]] +// CHECK1-NEXT: [[TMP1:%.*]] = load float, float* [[TMP0]], align 4, !tbaa [[TBAA12]] +// CHECK1-NEXT: store float [[TMP1]], float* [[__RE_]], align 4, !tbaa [[TBAA14]] // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1 // CHECK1-NEXT: [[TMP2:%.*]] = load float*, float** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP3:%.*]] = load float, float* [[TMP2]], align 4, !tbaa [[TBAA16]] -// CHECK1-NEXT: store float [[TMP3]], float* [[__IM_]], align 4, !tbaa [[TBAA20]] +// CHECK1-NEXT: [[TMP3:%.*]] = load float, float* [[TMP2]], align 4, !tbaa [[TBAA12]] +// CHECK1-NEXT: store float [[TMP3]], float* [[__IM_]], align 4, !tbaa [[TBAA16]] // CHECK1-NEXT: ret void // // @@ -1345,7 +1339,7 @@ // CHECK1-NEXT: store %"class.std::complex"* [[THIS]], %"class.std::complex"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA18]] +// CHECK1-NEXT: [[TMP0:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA14]] // CHECK1-NEXT: ret float [[TMP0]] // // @@ -1356,50 +1350,50 @@ // CHECK1-NEXT: store %"class.std::complex"* [[THIS]], %"class.std::complex"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP0:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA20]] +// CHECK1-NEXT: [[TMP0:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA16]] // CHECK1-NEXT: ret float [[TMP0]] // // // CHECK1-LABEL: define {{[^@]+}}@_ZNSt7complexIdEC2ERKdS2_ -// CHECK1-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 { +// CHECK1-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 { // CHECK1-NEXT: entry: -// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8 +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8 // CHECK1-NEXT: [[__RE_ADDR:%.*]] = alloca 
double*, align 8 // CHECK1-NEXT: [[__IM_ADDR:%.*]] = alloca double*, align 8 -// CHECK1-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK1-NEXT: store double* [[__RE]], double** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK1-NEXT: store double* [[__IM]], double** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8 -// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0 // CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[TMP0]], align 8, !tbaa [[TBAA22]] -// CHECK1-NEXT: store double [[TMP1]], double* [[__RE_]], align 8, !tbaa [[TBAA24]] -// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[TMP0]], align 8, !tbaa [[TBAA20]] +// CHECK1-NEXT: store double [[TMP1]], double* [[__RE_]], align 8, !tbaa [[TBAA22]] +// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1 // CHECK1-NEXT: [[TMP2:%.*]] = load double*, double** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8, !tbaa [[TBAA22]] -// CHECK1-NEXT: store double [[TMP3]], double* [[__IM_]], align 8, !tbaa [[TBAA26]] +// CHECK1-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8, !tbaa [[TBAA20]] +// CHECK1-NEXT: store double [[TMP3]], double* [[__IM_]], align 8, !tbaa [[TBAA24]] // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@_ZNKSt7complexIdE4realEv -// CHECK1-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { +// CHECK1-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { // CHECK1-NEXT: entry: -// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8 -// CHECK1-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8 -// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA24]] +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8 +// CHECK1-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP0:%.*]] = load double, double* [[__RE_]], align 8, !tbaa 
[[TBAA22]]
// CHECK1-NEXT: ret double [[TMP0]]
//
//
// CHECK1-LABEL: define {{[^@]+}}@_ZNKSt7complexIdE4imagEv
-// CHECK1-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK1-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
-// CHECK1-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8
-// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP0:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA26]]
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8
+// CHECK1-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP0:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA24]]
// CHECK1-NEXT: ret double [[TMP0]]
//
//
@@ -1475,7 +1469,6 @@
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA6:![0-9]+]]
// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR5]]
@@ -1505,115 +1498,113 @@
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10:![0-9]+]]
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK2-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2, !tbaa [[TBAA12:![0-9]+]]
-// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8, !tbaa [[TBAA14:![0-9]+]]
-// CHECK2-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
-// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
-// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 0
-// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
-// CHECK2-NEXT: [[ISTART:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
-// CHECK2-NEXT: [[IEND:%.*]] = getelementptr
inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 1 -// CHECK2-NEXT: [[PARTIAL_SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 2 -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP6]]) #[[ATTR5]] +// CHECK2-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK2-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32* +// CHECK2-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK2-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32* +// CHECK2-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK2-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex"* +// CHECK2-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP1:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP1]]) #[[ATTR5]] // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP7]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP2]]) #[[ATTR5]] // CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP8]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP3]]) #[[ATTR5]] // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP9]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP4]]) #[[ATTR5]] // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[IB]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP10]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP12]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP13]], 99 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: call void 
@__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP7]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP8]], 99 // CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK2: cond.true: // CHECK2-NEXT: br label [[COND_END:%.*]] // CHECK2: cond.false: -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: br label [[COND_END]] // CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ] +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ] // CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]] // CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]] // CHECK2: omp.inner.for.cond.cleanup: // CHECK2-NEXT: br label [[OMP_INNER_FOR_END:%.*]] // CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], 1 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK2-NEXT: store i32 [[ADD]], i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP19:%.*]] = bitcast float* [[REF_TMP]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP19]]) #[[ATTR5]] -// CHECK2-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA16:![0-9]+]] -// CHECK2-NEXT: [[TMP20:%.*]] = bitcast float* [[REF_TMP2]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP20]]) #[[ATTR5]] -// CHECK2-NEXT: store float 0.000000e+00, float* [[REF_TMP2]], align 4, !tbaa [[TBAA16]] -// CHECK2-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR7:[0-9]+]] -// CHECK2-NEXT: [[TMP21:%.*]] = bitcast float* [[REF_TMP2]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP21]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP22:%.*]] = bitcast float* [[REF_TMP]] to i8* -// CHECK2-NEXT: call void 
@llvm.lifetime.end.p0i8(i64 4, i8* [[TMP22]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP23]], 4 -// CHECK2-NEXT: store i32 [[MUL3]], i32* [[ISTART]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], 1 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast float* [[REF_TMP]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP14]]) #[[ATTR5]] +// CHECK2-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA12:![0-9]+]] +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast float* [[REF_TMP2]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP15]]) #[[ATTR5]] +// CHECK2-NEXT: store float 0.000000e+00, float* [[REF_TMP2]], align 4, !tbaa [[TBAA12]] +// CHECK2-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM_ON_STACK]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR7:[0-9]+]] +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast float* [[REF_TMP2]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP16]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP17:%.*]] = bitcast float* [[REF_TMP]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP17]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP18]], 4 +// CHECK2-NEXT: store i32 [[MUL3]], i32* [[ISTART_ON_STACK]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP19]], 1 // CHECK2-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4 -// CHECK2-NEXT: store i32 [[MUL5]], i32* [[IEND]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i32* [[ISTART]] to i8* -// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP28:%.*]] = bitcast i32* [[IEND]] to i8* -// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex"* [[PARTIAL_SUM]] to i8* -// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP31:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex"*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP31]], i64 3) +// CHECK2-NEXT: store i32 [[MUL5]], i32* [[IEND_ON_STACK]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i32* [[ISTART_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: [[TMP22:%.*]] = 
getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i32* [[IEND_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP25:%.*]] = bitcast %"class.std::complex"* [[PARTIAL_SUM_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: [[TMP26:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex"*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP26]], i64 3) // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: // CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], 1 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP27]], 1 // CHECK2-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK2: omp.inner.for.end: // CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP12]]) -// CHECK2-NEXT: [[TMP33:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]]) +// CHECK2-NEXT: [[TMP28:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP28]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP29:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP29]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP30]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP31:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP31]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP32:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP32]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP33:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP33]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP34:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP34]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP35:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP35]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP36:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP36]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP37:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP37]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP38:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP38]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP39:%.*]] = load i16, 
i16* @"_openmp_static_kernel$is_shared", align 2, !tbaa [[TBAA12]]
-// CHECK2-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP39]])
+// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[PARTIAL_SUM]])
+// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[IEND]])
+// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[ISTART]])
// CHECK2-NEXT: ret void
//
//
@@ -1715,10 +1706,10 @@
// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP21]]) #[[ATTR5]]
// CHECK2-NEXT: [[TMP22:%.*]] = bitcast float* [[REF_TMP]] to i8*
// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP22]]) #[[ATTR5]]
-// CHECK2-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA16]]
+// CHECK2-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA12]]
// CHECK2-NEXT: [[TMP23:%.*]] = bitcast float* [[REF_TMP6]] to i8*
// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP23]]) #[[ATTR5]]
-// CHECK2-NEXT: store float 0.000000e+00, float* [[REF_TMP6]], align 4, !tbaa [[TBAA16]]
+// CHECK2-NEXT: store float 0.000000e+00, float* [[REF_TMP6]], align 4, !tbaa [[TBAA12]]
// CHECK2-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR7]]
// CHECK2-NEXT: [[TMP24:%.*]] = bitcast float* [[REF_TMP6]] to i8*
// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP24]]) #[[ATTR5]]
@@ -1775,12 +1766,12 @@
// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP41]]) #[[ATTR5]]
// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]]
// CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP42]] to float
-// CHECK2-NEXT: store float [[CONV]], float* [[REF_TMP15]], align 4, !tbaa [[TBAA16]]
+// CHECK2-NEXT: store float [[CONV]], float* [[REF_TMP15]], align 4, !tbaa [[TBAA12]]
// CHECK2-NEXT: [[TMP43:%.*]] = bitcast float* [[REF_TMP16]] to i8*
// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP43]]) #[[ATTR5]]
// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]]
// CHECK2-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to float
-// CHECK2-NEXT: store float [[CONV17]], float* [[REF_TMP16]], align 4, !tbaa [[TBAA16]]
+// CHECK2-NEXT: store float [[CONV17]], float* [[REF_TMP16]], align 4, !tbaa [[TBAA12]]
// CHECK2-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]], float* nonnull align 4 dereferenceable(4) [[REF_TMP15]], float* nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR7]]
// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR7]]
// CHECK2-NEXT: [[TMP45:%.*]] = bitcast float* [[REF_TMP16]] to i8*
@@ -1863,15 +1854,15 @@
// CHECK2-NEXT: [[TMP0:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK2-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR7]]
// CHECK2-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0
-// CHECK2-NEXT: [[TMP1:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA18:![0-9]+]]
+// CHECK2-NEXT: [[TMP1:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA14:![0-9]+]]
// CHECK2-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[CALL]]
-// CHECK2-NEXT: store float [[ADD]], float* [[__RE_]], align 4, !tbaa [[TBAA18]]
+// CHECK2-NEXT: store float [[ADD]], float* [[__RE_]], align 4, !tbaa [[TBAA14]]
// CHECK2-NEXT: [[TMP2:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK2-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR7]]
// CHECK2-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1
-// CHECK2-NEXT: [[TMP3:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA20:![0-9]+]]
+// CHECK2-NEXT: [[TMP3:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA16:![0-9]+]]
// CHECK2-NEXT: [[ADD3:%.*]] = fadd float [[TMP3]], [[CALL2]]
-// CHECK2-NEXT: store float [[ADD3]], float* [[__IM_]], align 4, !tbaa [[TBAA20]]
+// CHECK2-NEXT: store float [[ADD3]], float* [[__IM_]], align 4, !tbaa [[TBAA16]]
// CHECK2-NEXT: ret %"class.std::complex"* [[THIS1]]
//
//
@@ -1885,14 +1876,14 @@
// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8
// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex", align 4
// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]]
-// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]]
-// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]]
+// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17:![0-9]+]]
+// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]]
+// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]]
// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]]
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
-// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]]
-// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]]
-// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]]
+// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17]]
+// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]]
+// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]]
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
@@ -1944,7 +1935,7 @@
// CHECK2-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to %"class.std::complex"*
// CHECK2-NEXT: [[TMP46:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8*
// CHECK2-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP44]] to i8*
-// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP46]], i8* align 4 [[TMP47]], i64 8, i1 false), !tbaa.struct !21
+// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP46]], i8* align 4 [[TMP47]], i64 8, i1 false), !tbaa.struct !19
// CHECK2-NEXT: br label [[IFCONT6:%.*]]
// CHECK2: else5:
// CHECK2-NEXT: br label [[IFCONT6]] @@ -2021,7 +2012,7 @@ // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA12]] +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA17]] // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 @@ -2061,10 +2052,10 @@ // CHECK2: .execute.parallel: // CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) // CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*) +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*) // CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] // CHECK2: .execute.fn: -// CHECK2-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5]] +// CHECK2-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5]] // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] // CHECK2: .check.next: // CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* @@ -2110,7 +2101,6 @@ // CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) // CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR5]] @@ -2140,142 +2130,140 @@ // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 // CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared3", align 2, !tbaa [[TBAA12]] -// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* @"_openmp_static_kernel$size4", align 8, !tbaa [[TBAA14]] -// CHECK2-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 0 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK2-NEXT: [[ISTART:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1 -// CHECK2-NEXT: [[IEND:%.*]] = 
getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 2 -// CHECK2-NEXT: [[PARTIAL_SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP6]]) #[[ATTR5]] +// CHECK2-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK2-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32* +// CHECK2-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK2-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32* +// CHECK2-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 16) +// CHECK2-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex.0"* +// CHECK2-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP1:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP1]]) #[[ATTR5]] // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP7]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP2]]) #[[ATTR5]] // CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP8]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP3]]) #[[ATTR5]] // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP9]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP4]]) #[[ATTR5]] // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[IB]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP10]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP12]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP13]], 99 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA6]] +// 
CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP8]], 99 // CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK2: cond.true: // CHECK2-NEXT: br label [[COND_END:%.*]] // CHECK2: cond.false: -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: br label [[COND_END]] // CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ] +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ] // CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]] // CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]] // CHECK2: omp.inner.for.cond.cleanup: // CHECK2-NEXT: br label [[OMP_INNER_FOR_END:%.*]] // CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], 1 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK2-NEXT: store i32 [[ADD]], i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP19:%.*]] = bitcast double* [[REF_TMP]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP19]]) #[[ATTR5]] -// CHECK2-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA22:![0-9]+]] -// CHECK2-NEXT: [[TMP20:%.*]] = bitcast double* [[REF_TMP2]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP20]]) #[[ATTR5]] -// CHECK2-NEXT: store double 0.000000e+00, double* [[REF_TMP2]], align 8, !tbaa [[TBAA22]] -// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR7]] -// CHECK2-NEXT: [[TMP21:%.*]] = bitcast double* [[REF_TMP2]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP21]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP22:%.*]] = bitcast double* [[REF_TMP]] to i8* -// 
CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP22]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP23]], 4 -// CHECK2-NEXT: store i32 [[MUL3]], i32* [[ISTART]], align 8, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], 1 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast double* [[REF_TMP]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP14]]) #[[ATTR5]] +// CHECK2-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA20:![0-9]+]] +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast double* [[REF_TMP2]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP15]]) #[[ATTR5]] +// CHECK2-NEXT: store double 0.000000e+00, double* [[REF_TMP2]], align 8, !tbaa [[TBAA20]] +// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM_ON_STACK]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR7]] +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[REF_TMP2]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP16]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP17:%.*]] = bitcast double* [[REF_TMP]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP17]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP18]], 4 +// CHECK2-NEXT: store i32 [[MUL3]], i32* [[ISTART_ON_STACK]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP19]], 1 // CHECK2-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4 -// CHECK2-NEXT: store i32 [[MUL5]], i32* [[IEND]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i32* [[ISTART]] to i8* -// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP28:%.*]] = bitcast i32* [[IEND]] to i8* -// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM]] to i8* -// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP31:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex.1"*)* @__omp_outlined__5 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** [[TMP31]], i64 3) +// CHECK2-NEXT: store i32 [[MUL5]], i32* [[IEND_ON_STACK]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP21:%.*]] = bitcast i32* [[ISTART_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8, !tbaa [[TBAA10]] +// 
CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i32* [[IEND_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP25:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: [[TMP26:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex.0"*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP26]], i64 3) // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: // CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], 1 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP27]], 1 // CHECK2-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK2: omp.inner.for.end: // CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP12]]) -// CHECK2-NEXT: [[TMP33:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]]) +// CHECK2-NEXT: [[TMP28:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP28]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP29:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP29]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP30]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP31:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP31]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP32:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* +// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP32]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP33:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP33]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP34:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP34]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP35:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP35]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP36:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP36]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP37:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP37]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP38:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* -// CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP38]]) #[[ATTR5]] -// 
CHECK2-NEXT: [[TMP39:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared3", align 2, !tbaa [[TBAA12]] -// CHECK2-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP39]]) +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[PARTIAL_SUM]]) +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[IEND]]) +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[ISTART]]) // CHECK2-NEXT: ret void // // // CHECK2-LABEL: define {{[^@]+}}@_ZNSt7complexIdEC1ERKdS2_ -// CHECK2-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 { +// CHECK2-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 { // CHECK2-NEXT: entry: -// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8 +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8 // CHECK2-NEXT: [[__RE_ADDR:%.*]] = alloca double*, align 8 // CHECK2-NEXT: [[__IM_ADDR:%.*]] = alloca double*, align 8 -// CHECK2-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: store double* [[__RE]], double** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: store double* [[__IM]], double** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8 // CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[__RE_ADDR]], align 8 // CHECK2-NEXT: [[TMP1:%.*]] = load double*, double** [[__IM_ADDR]], align 8 -// CHECK2-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR7]] +// CHECK2-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR7]] // CHECK2-NEXT: ret void // // -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ISTART:%.*]], i32* nonnull align 4 dereferenceable(4) [[IEND:%.*]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM:%.*]]) #[[ATTR0]] { +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ISTART:%.*]], i32* nonnull align 4 dereferenceable(4) [[IEND:%.*]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM:%.*]]) #[[ATTR0]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 // CHECK2-NEXT: [[ISTART_ADDR:%.*]] = alloca i32*, align 8 // CHECK2-NEXT: [[IEND_ADDR:%.*]] = alloca 
i32*, align 8 -// CHECK2-NEXT: [[PARTIAL_SUM_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8 +// CHECK2-NEXT: [[PARTIAL_SUM_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8 // CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 @@ -2286,11 +2274,11 @@ // CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[PARTIAL_SUM5:%.*]] = alloca %"class.std::complex.1", align 8 +// CHECK2-NEXT: [[PARTIAL_SUM5:%.*]] = alloca %"class.std::complex.0", align 8 // CHECK2-NEXT: [[REF_TMP:%.*]] = alloca double, align 8 // CHECK2-NEXT: [[REF_TMP6:%.*]] = alloca double, align 8 // CHECK2-NEXT: [[I7:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[REF_TMP14:%.*]] = alloca %"class.std::complex.1", align 8 +// CHECK2-NEXT: [[REF_TMP14:%.*]] = alloca %"class.std::complex.0", align 8 // CHECK2-NEXT: [[REF_TMP15:%.*]] = alloca double, align 8 // CHECK2-NEXT: [[REF_TMP16:%.*]] = alloca double, align 8 // CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 @@ -2298,10 +2286,10 @@ // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: store i32* [[ISTART]], i32** [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: store i32* [[IEND]], i32** [[IEND_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: store %"class.std::complex.1"* [[PARTIAL_SUM]], %"class.std::complex.1"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: store %"class.std::complex.0"* [[PARTIAL_SUM]], %"class.std::complex.0"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[IEND_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP2:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: [[TMP2:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP3]]) #[[ATTR5]] // CHECK2-NEXT: [[TMP4:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR_]] to i8* @@ -2346,15 +2334,15 @@ // CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP20]]) #[[ATTR5]] // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP21:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM5]] to i8* +// CHECK2-NEXT: [[TMP21:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM5]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 16, i8* [[TMP21]]) #[[ATTR5]] // CHECK2-NEXT: [[TMP22:%.*]] = bitcast double* [[REF_TMP]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP22]]) #[[ATTR5]] -// CHECK2-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA22]] +// CHECK2-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA20]] // CHECK2-NEXT: [[TMP23:%.*]] = bitcast double* [[REF_TMP6]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP23]]) #[[ATTR5]] -// CHECK2-NEXT: store double 0.000000e+00, double* 
[[REF_TMP6]], align 8, !tbaa [[TBAA22]] -// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR7]] +// CHECK2-NEXT: store double 0.000000e+00, double* [[REF_TMP6]], align 8, !tbaa [[TBAA20]] +// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR7]] // CHECK2-NEXT: [[TMP24:%.*]] = bitcast double* [[REF_TMP6]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP24]]) #[[ATTR5]] // CHECK2-NEXT: [[TMP25:%.*]] = bitcast double* [[REF_TMP]] to i8* @@ -2404,25 +2392,25 @@ // CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP39]], 1 // CHECK2-NEXT: [[ADD13:%.*]] = add i32 [[TMP38]], [[MUL]] // CHECK2-NEXT: store i32 [[ADD13]], i32* [[I7]], align 4, !tbaa [[TBAA6]] -// CHECK2-NEXT: [[TMP40:%.*]] = bitcast %"class.std::complex.1"* [[REF_TMP14]] to i8* +// CHECK2-NEXT: [[TMP40:%.*]] = bitcast %"class.std::complex.0"* [[REF_TMP14]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 16, i8* [[TMP40]]) #[[ATTR5]] // CHECK2-NEXT: [[TMP41:%.*]] = bitcast double* [[REF_TMP15]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP41]]) #[[ATTR5]] // CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP42]] to double -// CHECK2-NEXT: store double [[CONV]], double* [[REF_TMP15]], align 8, !tbaa [[TBAA22]] +// CHECK2-NEXT: store double [[CONV]], double* [[REF_TMP15]], align 8, !tbaa [[TBAA20]] // CHECK2-NEXT: [[TMP43:%.*]] = bitcast double* [[REF_TMP16]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP43]]) #[[ATTR5]] // CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to double -// CHECK2-NEXT: store double [[CONV17]], double* [[REF_TMP16]], align 8, !tbaa [[TBAA22]] -// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR7]] -// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.1"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR7]] +// CHECK2-NEXT: store double [[CONV17]], double* [[REF_TMP16]], align 8, !tbaa [[TBAA20]] +// CHECK2-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR7]] +// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR7]] // CHECK2-NEXT: [[TMP45:%.*]] = bitcast double* [[REF_TMP16]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP45]]) #[[ATTR5]] 
// CHECK2-NEXT: [[TMP46:%.*]] = bitcast double* [[REF_TMP15]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP46]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex.1"* [[REF_TMP14]] to i8* +// CHECK2-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex.0"* [[REF_TMP14]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 16, i8* [[TMP47]]) #[[ATTR5]] // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -2451,20 +2439,20 @@ // CHECK2-NEXT: [[TMP55:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK2-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: [[TMP57:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP58:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM5]] to i8* +// CHECK2-NEXT: [[TMP58:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM5]] to i8* // CHECK2-NEXT: store i8* [[TMP58]], i8** [[TMP57]], align 8 // CHECK2-NEXT: [[TMP59:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP56]], i32 1, i64 8, i8* [[TMP59]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func7, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func8) +// CHECK2-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP56]], i32 1, i64 8, i8* [[TMP59]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6) // CHECK2-NEXT: [[TMP61:%.*]] = icmp eq i32 [[TMP60]], 1 // CHECK2-NEXT: br i1 [[TMP61]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // CHECK2: .omp.reduction.then: -// CHECK2-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.1"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR7]] +// CHECK2-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR7]] // CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP56]]) // CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] // CHECK2: .omp.reduction.done: // CHECK2-NEXT: [[TMP62:%.*]] = bitcast i32* [[I7]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP62]]) #[[ATTR5]] -// CHECK2-NEXT: [[TMP63:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM5]] to i8* +// CHECK2-NEXT: [[TMP63:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM5]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 16, i8* [[TMP63]]) #[[ATTR5]] // CHECK2-NEXT: [[TMP64:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* // CHECK2-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP64]]) #[[ATTR5]] @@ -2488,29 +2476,29 @@ // // // CHECK2-LABEL: define {{[^@]+}}@_ZNSt7complexIdEpLIdEERS0_RKS_IT_E -// CHECK2-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[__C:%.*]]) #[[ATTR4]] comdat align 2 { +// CHECK2-SAME: (%"class.std::complex.0"* nonnull align 8 
dereferenceable(16) [[THIS:%.*]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[__C:%.*]]) #[[ATTR4]] comdat align 2 { // CHECK2-NEXT: entry: -// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8 -// CHECK2-NEXT: [[__C_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8 -// CHECK2-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: store %"class.std::complex.1"* [[__C]], %"class.std::complex.1"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR7]] -// CHECK2-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP1:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA24:![0-9]+]] +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8 +// CHECK2-NEXT: [[__C_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8 +// CHECK2-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: store %"class.std::complex.0"* [[__C]], %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR7]] +// CHECK2-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP1:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA22:![0-9]+]] // CHECK2-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[CALL]] -// CHECK2-NEXT: store double [[ADD]], double* [[__RE_]], align 8, !tbaa [[TBAA24]] -// CHECK2-NEXT: [[TMP2:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR7]] -// CHECK2-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA26:![0-9]+]] +// CHECK2-NEXT: store double [[ADD]], double* [[__RE_]], align 8, !tbaa [[TBAA22]] +// CHECK2-NEXT: [[TMP2:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR7]] +// CHECK2-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA24:![0-9]+]] // CHECK2-NEXT: [[ADD3:%.*]] = fadd double [[TMP3]], [[CALL2]] -// CHECK2-NEXT: store double 
[[ADD3]], double* [[__IM_]], align 8, !tbaa [[TBAA26]] -// CHECK2-NEXT: ret %"class.std::complex.1"* [[THIS1]] +// CHECK2-NEXT: store double [[ADD3]], double* [[__IM_]], align 8, !tbaa [[TBAA24]] +// CHECK2-NEXT: ret %"class.std::complex.0"* [[THIS1]] // // -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7 +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5 // CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 @@ -2518,24 +2506,24 @@ // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 // CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 // CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex.1", align 8 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex.0", align 8 // CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]] -// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]] -// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]] +// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17]] +// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]] +// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]] // CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* -// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]] -// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]] -// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]] +// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17]] +// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]] +// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]] // CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 // CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 // CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex.1"* -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr %"class.std::complex.1", %"class.std::complex.1"* [[TMP12]], i64 1 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast %"class.std::complex.1"* [[TMP13]] to i8* -// CHECK2-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex.1"* [[TMP12]] to i64* -// CHECK2-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex.1"* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex.0"* +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr %"class.std::complex.0", %"class.std::complex.0"* [[TMP12]], i64 1 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast %"class.std::complex.0"* [[TMP13]] to i8* +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex.0"* [[TMP12]] to i64* +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i64* // CHECK2-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]] // CHECK2: 
.shuffle.pre_cond: // CHECK2-NEXT: [[TMP17:%.*]] = phi i64* [ [[TMP15]], [[ENTRY:%.*]] ], [ [[TMP28:%.*]], [[DOTSHUFFLE_THEN:%.*]] ] @@ -2557,7 +2545,7 @@ // CHECK2-NEXT: [[TMP29]] = getelementptr i64, i64* [[TMP18]], i64 1 // CHECK2-NEXT: br label [[DOTSHUFFLE_PRE_COND]] // CHECK2: .shuffle.exit: -// CHECK2-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.1"* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8* // CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP11]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP8]], 0 // CHECK2-NEXT: [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 1 @@ -2575,7 +2563,7 @@ // CHECK2: then: // CHECK2-NEXT: [[TMP43:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* // CHECK2-NEXT: [[TMP44:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func6"(i8* [[TMP43]], i8* [[TMP44]]) #[[ATTR5]] +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP43]], i8* [[TMP44]]) #[[ATTR5]] // CHECK2-NEXT: br label [[IFCONT:%.*]] // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] @@ -2589,11 +2577,11 @@ // CHECK2-NEXT: [[TMP49:%.*]] = load i8*, i8** [[TMP48]], align 8 // CHECK2-NEXT: [[TMP50:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 // CHECK2-NEXT: [[TMP51:%.*]] = load i8*, i8** [[TMP50]], align 8 -// CHECK2-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP49]] to %"class.std::complex.1"* -// CHECK2-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP51]] to %"class.std::complex.1"* -// CHECK2-NEXT: [[TMP54:%.*]] = bitcast %"class.std::complex.1"* [[TMP53]] to i8* -// CHECK2-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.1"* [[TMP52]] to i8* -// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP54]], i8* align 8 [[TMP55]], i64 16, i1 false), !tbaa.struct !27 +// CHECK2-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP49]] to %"class.std::complex.0"* +// CHECK2-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP51]] to %"class.std::complex.0"* +// CHECK2-NEXT: [[TMP54:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8* +// CHECK2-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP52]] to i8* +// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP54]], i8* align 8 [[TMP55]], i64 16, i1 false), !tbaa.struct !25 // CHECK2-NEXT: br label [[IFCONT6:%.*]] // CHECK2: else5: // CHECK2-NEXT: br label [[IFCONT6]] @@ -2601,7 +2589,7 @@ // CHECK2-NEXT: ret void // // -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8 +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6 // CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 @@ -2662,7 +2650,7 @@ // CHECK2-NEXT: ret void // // -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper // CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 @@ -2670,7 +2658,7 @@ // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA12]] +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA17]] // CHECK2-NEXT: store i32 [[TMP1]], i32* 
[[DOTADDR1]], align 4, !tbaa [[TBAA6]] // CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) // CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 @@ -2681,9 +2669,9 @@ // CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** // CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 2 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex.1"** -// CHECK2-NEXT: [[TMP11:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[TMP10]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]], %"class.std::complex.1"* [[TMP11]]) #[[ATTR5]] +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex.0"** +// CHECK2-NEXT: [[TMP11:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP10]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]], %"class.std::complex.0"* [[TMP11]]) #[[ATTR5]] // CHECK2-NEXT: ret void // // @@ -2699,12 +2687,12 @@ // CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8 // CHECK2-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0 // CHECK2-NEXT: [[TMP0:%.*]] = load float*, float** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP1:%.*]] = load float, float* [[TMP0]], align 4, !tbaa [[TBAA16]] -// CHECK2-NEXT: store float [[TMP1]], float* [[__RE_]], align 4, !tbaa [[TBAA18]] +// CHECK2-NEXT: [[TMP1:%.*]] = load float, float* [[TMP0]], align 4, !tbaa [[TBAA12]] +// CHECK2-NEXT: store float [[TMP1]], float* [[__RE_]], align 4, !tbaa [[TBAA14]] // CHECK2-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1 // CHECK2-NEXT: [[TMP2:%.*]] = load float*, float** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP2]], align 4, !tbaa [[TBAA16]] -// CHECK2-NEXT: store float [[TMP3]], float* [[__IM_]], align 4, !tbaa [[TBAA20]] +// CHECK2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP2]], align 4, !tbaa [[TBAA12]] +// CHECK2-NEXT: store float [[TMP3]], float* [[__IM_]], align 4, !tbaa [[TBAA16]] // CHECK2-NEXT: ret void // // @@ -2715,7 +2703,7 @@ // CHECK2-NEXT: store %"class.std::complex"* [[THIS]], %"class.std::complex"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8 // CHECK2-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP0:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA18]] +// CHECK2-NEXT: [[TMP0:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA14]] // CHECK2-NEXT: ret float [[TMP0]] // // @@ -2726,50 +2714,50 @@ // CHECK2-NEXT: store %"class.std::complex"* [[THIS]], %"class.std::complex"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8 // CHECK2-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP0:%.*]] = load float, float* [[__IM_]], align 4, !tbaa 
[[TBAA20]] +// CHECK2-NEXT: [[TMP0:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA16]] // CHECK2-NEXT: ret float [[TMP0]] // // // CHECK2-LABEL: define {{[^@]+}}@_ZNSt7complexIdEC2ERKdS2_ -// CHECK2-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 { +// CHECK2-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 { // CHECK2-NEXT: entry: -// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8 +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8 // CHECK2-NEXT: [[__RE_ADDR:%.*]] = alloca double*, align 8 // CHECK2-NEXT: [[__IM_ADDR:%.*]] = alloca double*, align 8 -// CHECK2-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: store double* [[__RE]], double** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK2-NEXT: store double* [[__IM]], double** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8 -// CHECK2-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 0 +// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0 // CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP1:%.*]] = load double, double* [[TMP0]], align 8, !tbaa [[TBAA22]] -// CHECK2-NEXT: store double [[TMP1]], double* [[__RE_]], align 8, !tbaa [[TBAA24]] -// CHECK2-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP1:%.*]] = load double, double* [[TMP0]], align 8, !tbaa [[TBAA20]] +// CHECK2-NEXT: store double [[TMP1]], double* [[__RE_]], align 8, !tbaa [[TBAA22]] +// CHECK2-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1 // CHECK2-NEXT: [[TMP2:%.*]] = load double*, double** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8, !tbaa [[TBAA22]] -// CHECK2-NEXT: store double [[TMP3]], double* [[__IM_]], align 8, !tbaa [[TBAA26]] +// CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8, !tbaa [[TBAA20]] +// CHECK2-NEXT: store double [[TMP3]], double* [[__IM_]], align 8, !tbaa [[TBAA24]] // CHECK2-NEXT: ret void // // // CHECK2-LABEL: define {{[^@]+}}@_ZNKSt7complexIdE4realEv -// CHECK2-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { +// CHECK2-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { // CHECK2-NEXT: entry: -// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8 -// CHECK2-NEXT: store %"class.std::complex.1"* [[THIS]], 
%"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8 -// CHECK2-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP0:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA24]] +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8 +// CHECK2-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP0:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA22]] // CHECK2-NEXT: ret double [[TMP0]] // // // CHECK2-LABEL: define {{[^@]+}}@_ZNKSt7complexIdE4imagEv -// CHECK2-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { +// CHECK2-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { // CHECK2-NEXT: entry: -// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8 -// CHECK2-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8 -// CHECK2-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP0:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA26]] +// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8 +// CHECK2-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK2-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8 +// CHECK2-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP0:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA24]] // CHECK2-NEXT: ret double [[TMP0]] // // @@ -2845,7 +2833,6 @@ // CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) // CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA6:![0-9]+]] // CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR5]] @@ -2875,115 +2862,113 @@ // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 // CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10:![0-9]+]] // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK3-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2, !tbaa [[TBAA12:![0-9]+]] -// CHECK3-NEXT: [[TMP1:%.*]] = load i64, i64* 
@"_openmp_static_kernel$size", align 8, !tbaa [[TBAA14:![0-9]+]] -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 0 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: [[ISTART:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[IEND:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[PARTIAL_SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP6]]) #[[ATTR5]] +// CHECK3-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK3-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32* +// CHECK3-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK3-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32* +// CHECK3-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK3-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex"* +// CHECK3-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP1:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP1]]) #[[ATTR5]] // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP7]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP2:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP2]]) #[[ATTR5]] // CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP8]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP3]]) #[[ATTR5]] // CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP9]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP4]]) #[[ATTR5]] // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i32* [[IB]] to i8* -// 
CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP10]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP12]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP13]], 99 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP7]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP8]], 99 // CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK3: cond.true: // CHECK3-NEXT: br label [[COND_END:%.*]] // CHECK3: cond.false: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] // CHECK3-NEXT: br label [[COND_END]] // CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ] +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ] // CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]] // CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]] // CHECK3: omp.inner.for.cond.cleanup: // CHECK3-NEXT: br label [[OMP_INNER_FOR_END:%.*]] // CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], 1 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK3-NEXT: store i32 [[ADD]], i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP19:%.*]] = bitcast float* [[REF_TMP]] 
to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP19]]) #[[ATTR5]] -// CHECK3-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA16:![0-9]+]] -// CHECK3-NEXT: [[TMP20:%.*]] = bitcast float* [[REF_TMP2]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP20]]) #[[ATTR5]] -// CHECK3-NEXT: store float 0.000000e+00, float* [[REF_TMP2]], align 4, !tbaa [[TBAA16]] -// CHECK3-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR7:[0-9]+]] -// CHECK3-NEXT: [[TMP21:%.*]] = bitcast float* [[REF_TMP2]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP21]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP22:%.*]] = bitcast float* [[REF_TMP]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP22]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP23]], 4 -// CHECK3-NEXT: store i32 [[MUL3]], i32* [[ISTART]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], 1 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast float* [[REF_TMP]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP14]]) #[[ATTR5]] +// CHECK3-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA12:![0-9]+]] +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast float* [[REF_TMP2]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP15]]) #[[ATTR5]] +// CHECK3-NEXT: store float 0.000000e+00, float* [[REF_TMP2]], align 4, !tbaa [[TBAA12]] +// CHECK3-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM_ON_STACK]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR7:[0-9]+]] +// CHECK3-NEXT: [[TMP16:%.*]] = bitcast float* [[REF_TMP2]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP16]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP17:%.*]] = bitcast float* [[REF_TMP]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP17]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP18]], 4 +// CHECK3-NEXT: store i32 [[MUL3]], i32* [[ISTART_ON_STACK]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP19]], 1 // CHECK3-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4 -// CHECK3-NEXT: store i32 [[MUL5]], i32* [[IEND]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i32* [[ISTART]] to i8* -// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8, !tbaa [[TBAA10]] -// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK3-NEXT: [[TMP28:%.*]] = bitcast i32* [[IEND]] to i8* -// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8, !tbaa [[TBAA10]] -// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// 
CHECK3-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex"* [[PARTIAL_SUM]] to i8* -// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8, !tbaa [[TBAA10]] -// CHECK3-NEXT: [[TMP31:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex"*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP31]], i64 3) +// CHECK3-NEXT: store i32 [[MUL5]], i32* [[IEND_ON_STACK]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i32* [[ISTART_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8, !tbaa [[TBAA10]] +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK3-NEXT: [[TMP23:%.*]] = bitcast i32* [[IEND_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8, !tbaa [[TBAA10]] +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK3-NEXT: [[TMP25:%.*]] = bitcast %"class.std::complex"* [[PARTIAL_SUM_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8, !tbaa [[TBAA10]] +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex"*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP26]], i64 3) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], 1 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP27]], 1 // CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK3: omp.inner.for.end: // CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP12]]) -// CHECK3-NEXT: [[TMP33:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]]) +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP28]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP29:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP29]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP30:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP30]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP31:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP31]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP32:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP32]]) #[[ATTR5]] +// CHECK3-NEXT: 
[[TMP33:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP33]]) #[[ATTR5]]
-// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
-// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP34]]) #[[ATTR5]]
-// CHECK3-NEXT: [[TMP35:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8*
-// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP35]]) #[[ATTR5]]
-// CHECK3-NEXT: [[TMP36:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8*
-// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP36]]) #[[ATTR5]]
-// CHECK3-NEXT: [[TMP37:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8*
-// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP37]]) #[[ATTR5]]
-// CHECK3-NEXT: [[TMP38:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
-// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP38]]) #[[ATTR5]]
-// CHECK3-NEXT: [[TMP39:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2, !tbaa [[TBAA12]]
-// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP39]])
+// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[PARTIAL_SUM]])
+// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[IEND]])
+// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[ISTART]])
// CHECK3-NEXT: ret void
//
//
@@ -3085,10 +3070,10 @@
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP21]]) #[[ATTR5]]
// CHECK3-NEXT: [[TMP22:%.*]] = bitcast float* [[REF_TMP]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP22]]) #[[ATTR5]]
-// CHECK3-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA16]]
+// CHECK3-NEXT: store float 0.000000e+00, float* [[REF_TMP]], align 4, !tbaa [[TBAA12]]
// CHECK3-NEXT: [[TMP23:%.*]] = bitcast float* [[REF_TMP6]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP23]]) #[[ATTR5]]
-// CHECK3-NEXT: store float 0.000000e+00, float* [[REF_TMP6]], align 4, !tbaa [[TBAA16]]
+// CHECK3-NEXT: store float 0.000000e+00, float* [[REF_TMP6]], align 4, !tbaa [[TBAA12]]
// CHECK3-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], float* nonnull align 4 dereferenceable(4) [[REF_TMP]], float* nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR7]]
// CHECK3-NEXT: [[TMP24:%.*]] = bitcast float* [[REF_TMP6]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP24]]) #[[ATTR5]]
@@ -3145,12 +3130,12 @@
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP41]]) #[[ATTR5]]
// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]]
// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP42]] to float
-// CHECK3-NEXT: store float [[CONV]], float* [[REF_TMP15]], align 4, !tbaa [[TBAA16]]
+// CHECK3-NEXT: store float [[CONV]], float* [[REF_TMP15]], align 4, !tbaa [[TBAA12]]
// CHECK3-NEXT: [[TMP43:%.*]] = bitcast float* [[REF_TMP16]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP43]]) #[[ATTR5]]
// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]]
// CHECK3-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to float
-// CHECK3-NEXT: store float [[CONV17]], float* [[REF_TMP16]], align 4, !tbaa [[TBAA16]]
+// CHECK3-NEXT: store float [[CONV17]], float* [[REF_TMP16]], align 4, !tbaa [[TBAA12]]
// CHECK3-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]], float* nonnull align 4 dereferenceable(4) [[REF_TMP15]], float* nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR7]]
// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) %"class.std::complex"* @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], %"class.std::complex"* nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR7]]
// CHECK3-NEXT: [[TMP45:%.*]] = bitcast float* [[REF_TMP16]] to i8*
@@ -3233,15 +3218,15 @@
// CHECK3-NEXT: [[TMP0:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR7]]
// CHECK3-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0
-// CHECK3-NEXT: [[TMP1:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA18:![0-9]+]]
+// CHECK3-NEXT: [[TMP1:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA14:![0-9]+]]
// CHECK3-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[CALL]]
-// CHECK3-NEXT: store float [[ADD]], float* [[__RE_]], align 4, !tbaa [[TBAA18]]
+// CHECK3-NEXT: store float [[ADD]], float* [[__RE_]], align 4, !tbaa [[TBAA14]]
// CHECK3-NEXT: [[TMP2:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(%"class.std::complex"* nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR7]]
// CHECK3-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1
-// CHECK3-NEXT: [[TMP3:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA20:![0-9]+]]
+// CHECK3-NEXT: [[TMP3:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA16:![0-9]+]]
// CHECK3-NEXT: [[ADD3:%.*]] = fadd float [[TMP3]], [[CALL2]]
-// CHECK3-NEXT: store float [[ADD3]], float* [[__IM_]], align 4, !tbaa [[TBAA20]]
+// CHECK3-NEXT: store float [[ADD3]], float* [[__IM_]], align 4, !tbaa [[TBAA16]]
// CHECK3-NEXT: ret %"class.std::complex"* [[THIS1]]
//
//
@@ -3255,14 +3240,14 @@
// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8
// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex", align 4
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]]
-// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]]
-// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]]
+// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17:![0-9]+]]
+// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]]
+// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]]
// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
-// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]]
-// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]]
-// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]]
+// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17]]
+// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]]
+// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]]
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
@@ -3314,7 +3299,7 @@
// CHECK3-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to %"class.std::complex"*
// CHECK3-NEXT: [[TMP46:%.*]] = bitcast %"class.std::complex"* [[TMP45]] to i8*
// CHECK3-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex"* [[TMP44]] to i8*
-// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP46]], i8* align 4 [[TMP47]], i64 8, i1 false), !tbaa.struct !21
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP46]], i8* align 4 [[TMP47]], i64 8, i1 false), !tbaa.struct !19
// CHECK3-NEXT: br label [[IFCONT6:%.*]]
// CHECK3: else5:
// CHECK3-NEXT: br label [[IFCONT6]]
@@ -3391,7 +3376,7 @@
// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA12]]
+// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA17]]
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA6]]
// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
@@ -3431,10 +3416,10 @@
// CHECK3: .execute.parallel:
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
-// CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*)
+// CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
// CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
// CHECK3: .execute.fn:
-// CHECK3-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5]]
+// CHECK3-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR5]]
// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK3: .check.next:
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
@@ -3480,7 +3465,6 @@
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA6]]
// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR5]]
@@ -3510,142 +3494,140 @@
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared3", align 2, !tbaa [[TBAA12]]
load i64, i64* @"_openmp_static_kernel$size4", align 8, !tbaa [[TBAA14]] -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 0 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[ISTART:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[IEND:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 2 -// CHECK3-NEXT: [[PARTIAL_SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP6]]) #[[ATTR5]] +// CHECK3-NEXT: [[ISTART:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK3-NEXT: [[ISTART_ON_STACK:%.*]] = bitcast i8* [[ISTART]] to i32* +// CHECK3-NEXT: [[IEND:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK3-NEXT: [[IEND_ON_STACK:%.*]] = bitcast i8* [[IEND]] to i32* +// CHECK3-NEXT: [[PARTIAL_SUM:%.*]] = call i8* @__kmpc_alloc_shared(i64 16) +// CHECK3-NEXT: [[PARTIAL_SUM_ON_STACK:%.*]] = bitcast i8* [[PARTIAL_SUM]] to %"class.std::complex.0"* +// CHECK3-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP0]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP1:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP1]]) #[[ATTR5]] // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP7]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP2:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP2]]) #[[ATTR5]] // CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP8]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP3]]) #[[ATTR5]] // CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP9]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP4]]) #[[ATTR5]] // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast 
i32* [[IB]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP10]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP12]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP13]], 99 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP5]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP8]], 99 // CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK3: cond.true: // CHECK3-NEXT: br label [[COND_END:%.*]] // CHECK3: cond.false: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] // CHECK3-NEXT: br label [[COND_END]] // CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ] +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ] // CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: store i32 [[TMP15]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]] // CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]] // CHECK3: omp.inner.for.cond.cleanup: // CHECK3-NEXT: br label [[OMP_INNER_FOR_END:%.*]] // CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], 1 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK3-NEXT: store i32 [[ADD]], i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP19:%.*]] = bitcast double* 
[[REF_TMP]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP19]]) #[[ATTR5]] -// CHECK3-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA22:![0-9]+]] -// CHECK3-NEXT: [[TMP20:%.*]] = bitcast double* [[REF_TMP2]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP20]]) #[[ATTR5]] -// CHECK3-NEXT: store double 0.000000e+00, double* [[REF_TMP2]], align 8, !tbaa [[TBAA22]] -// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR7]] -// CHECK3-NEXT: [[TMP21:%.*]] = bitcast double* [[REF_TMP2]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP21]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP22:%.*]] = bitcast double* [[REF_TMP]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP22]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP23]], 4 -// CHECK3-NEXT: store i32 [[MUL3]], i32* [[ISTART]], align 8, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], 1 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast double* [[REF_TMP]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP14]]) #[[ATTR5]] +// CHECK3-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA20:![0-9]+]] +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast double* [[REF_TMP2]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP15]]) #[[ATTR5]] +// CHECK3-NEXT: store double 0.000000e+00, double* [[REF_TMP2]], align 8, !tbaa [[TBAA20]] +// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM_ON_STACK]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR7]] +// CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[REF_TMP2]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP16]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP17:%.*]] = bitcast double* [[REF_TMP]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP17]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP18]], 4 +// CHECK3-NEXT: store i32 [[MUL3]], i32* [[ISTART_ON_STACK]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[IB]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP19]], 1 // CHECK3-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4 -// CHECK3-NEXT: store i32 [[MUL5]], i32* [[IEND]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i32* [[ISTART]] to i8* -// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8, !tbaa [[TBAA10]] -// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK3-NEXT: [[TMP28:%.*]] = bitcast i32* [[IEND]] to i8* -// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8, !tbaa [[TBAA10]] -// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], 
i64 0, i64 2 -// CHECK3-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM]] to i8* -// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8, !tbaa [[TBAA10]] -// CHECK3-NEXT: [[TMP31:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex.1"*)* @__omp_outlined__5 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** [[TMP31]], i64 3) +// CHECK3-NEXT: store i32 [[MUL5]], i32* [[IEND_ON_STACK]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK3-NEXT: [[TMP21:%.*]] = bitcast i32* [[ISTART_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8, !tbaa [[TBAA10]] +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK3-NEXT: [[TMP23:%.*]] = bitcast i32* [[IEND_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8, !tbaa [[TBAA10]] +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK3-NEXT: [[TMP25:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8, !tbaa [[TBAA10]] +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*, %"class.std::complex.0"*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** [[TMP26]], i64 3) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] -// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], 1 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP27]], 1 // CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4, !tbaa [[TBAA6]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK3: omp.inner.for.end: // CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP12]]) -// CHECK3-NEXT: [[TMP33:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]]) +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast i32* [[IB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP28]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP29:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP29]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP30:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP30]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP31:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP31]]) #[[ATTR5]] +// CHECK3-NEXT: [[TMP32:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* +// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP32]]) 
#[[ATTR5]] +// CHECK3-NEXT: [[TMP33:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* // CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP33]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP34]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP35:%.*]] = bitcast i32* [[DOTOMP_STRIDE]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP35]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP36:%.*]] = bitcast i32* [[DOTOMP_UB]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP36]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP37:%.*]] = bitcast i32* [[DOTOMP_LB]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP37]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP38:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8* -// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP38]]) #[[ATTR5]] -// CHECK3-NEXT: [[TMP39:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared3", align 2, !tbaa [[TBAA12]] -// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP39]]) +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[PARTIAL_SUM]]) +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[IEND]]) +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[ISTART]]) // CHECK3-NEXT: ret void // // // CHECK3-LABEL: define {{[^@]+}}@_ZNSt7complexIdEC1ERKdS2_ -// CHECK3-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 { +// CHECK3-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 { // CHECK3-NEXT: entry: -// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8 +// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8 // CHECK3-NEXT: [[__RE_ADDR:%.*]] = alloca double*, align 8 // CHECK3-NEXT: [[__IM_ADDR:%.*]] = alloca double*, align 8 -// CHECK3-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] +// CHECK3-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK3-NEXT: store double* [[__RE]], double** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]] // CHECK3-NEXT: store double* [[__IM]], double** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]] -// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8 +// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[__RE_ADDR]], align 8 // CHECK3-NEXT: [[TMP1:%.*]] = load double*, double** [[__IM_ADDR]], align 8 -// CHECK3-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR7]] +// CHECK3-NEXT: call void @_ZNSt7complexIdEC2ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS1]], double* nonnull align 8 dereferenceable(8) [[TMP0]], double* nonnull align 8 dereferenceable(8) [[TMP1]]) #[[ATTR7]] // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define 
-// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5
-// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ISTART:%.*]], i32* nonnull align 4 dereferenceable(4) [[IEND:%.*]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM:%.*]]) #[[ATTR0]] {
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ISTART:%.*]], i32* nonnull align 4 dereferenceable(4) [[IEND:%.*]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK3-NEXT: [[ISTART_ADDR:%.*]] = alloca i32*, align 8
// CHECK3-NEXT: [[IEND_ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[PARTIAL_SUM_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
+// CHECK3-NEXT: [[PARTIAL_SUM_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8
// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
@@ -3656,11 +3638,11 @@
// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[PARTIAL_SUM5:%.*]] = alloca %"class.std::complex.1", align 8
+// CHECK3-NEXT: [[PARTIAL_SUM5:%.*]] = alloca %"class.std::complex.0", align 8
// CHECK3-NEXT: [[REF_TMP:%.*]] = alloca double, align 8
// CHECK3-NEXT: [[REF_TMP6:%.*]] = alloca double, align 8
// CHECK3-NEXT: [[I7:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[REF_TMP14:%.*]] = alloca %"class.std::complex.1", align 8
+// CHECK3-NEXT: [[REF_TMP14:%.*]] = alloca %"class.std::complex.0", align 8
// CHECK3-NEXT: [[REF_TMP15:%.*]] = alloca double, align 8
// CHECK3-NEXT: [[REF_TMP16:%.*]] = alloca double, align 8
// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
@@ -3668,10 +3650,10 @@
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: store i32* [[ISTART]], i32** [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: store i32* [[IEND]], i32** [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: store %"class.std::complex.1"* [[PARTIAL_SUM]], %"class.std::complex.1"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK3-NEXT: store %"class.std::complex.0"* [[PARTIAL_SUM]], %"class.std::complex.0"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[TMP2:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK3-NEXT: [[TMP2:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[DOTOMP_IV]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP3]]) #[[ATTR5]]
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR_]] to i8*
@@ -3716,15 +3698,15 @@
// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP20]]) #[[ATTR5]]
// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA6]]
-// CHECK3-NEXT: [[TMP21:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM5]] to i8*
+// CHECK3-NEXT: [[TMP21:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM5]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 16, i8* [[TMP21]]) #[[ATTR5]]
// CHECK3-NEXT: [[TMP22:%.*]] = bitcast double* [[REF_TMP]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP22]]) #[[ATTR5]]
-// CHECK3-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA22]]
+// CHECK3-NEXT: store double 0.000000e+00, double* [[REF_TMP]], align 8, !tbaa [[TBAA20]]
// CHECK3-NEXT: [[TMP23:%.*]] = bitcast double* [[REF_TMP6]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP23]]) #[[ATTR5]]
-// CHECK3-NEXT: store double 0.000000e+00, double* [[REF_TMP6]], align 8, !tbaa [[TBAA22]]
-// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR7]]
+// CHECK3-NEXT: store double 0.000000e+00, double* [[REF_TMP6]], align 8, !tbaa [[TBAA20]]
+// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], double* nonnull align 8 dereferenceable(8) [[REF_TMP]], double* nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR7]]
// CHECK3-NEXT: [[TMP24:%.*]] = bitcast double* [[REF_TMP6]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP24]]) #[[ATTR5]]
// CHECK3-NEXT: [[TMP25:%.*]] = bitcast double* [[REF_TMP]] to i8*
@@ -3774,25 +3756,25 @@
// CHECK3-NEXT: [[MUL:%.*]] = mul i32 [[TMP39]], 1
// CHECK3-NEXT: [[ADD13:%.*]] = add i32 [[TMP38]], [[MUL]]
// CHECK3-NEXT: store i32 [[ADD13]], i32* [[I7]], align 4, !tbaa [[TBAA6]]
-// CHECK3-NEXT: [[TMP40:%.*]] = bitcast %"class.std::complex.1"* [[REF_TMP14]] to i8*
+// CHECK3-NEXT: [[TMP40:%.*]] = bitcast %"class.std::complex.0"* [[REF_TMP14]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 16, i8* [[TMP40]]) #[[ATTR5]]
// CHECK3-NEXT: [[TMP41:%.*]] = bitcast double* [[REF_TMP15]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP41]]) #[[ATTR5]]
// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]]
// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP42]] to double
-// CHECK3-NEXT: store double [[CONV]], double* [[REF_TMP15]], align 8, !tbaa [[TBAA22]]
+// CHECK3-NEXT: store double [[CONV]], double* [[REF_TMP15]], align 8, !tbaa [[TBAA20]]
// CHECK3-NEXT: [[TMP43:%.*]] = bitcast double* [[REF_TMP16]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[TMP43]]) #[[ATTR5]]
// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[I7]], align 4, !tbaa [[TBAA6]]
// CHECK3-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP44]] to double
-// CHECK3-NEXT: store double [[CONV17]], double* [[REF_TMP16]], align 8, !tbaa [[TBAA22]]
-// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR7]]
-// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.1"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR7]]
+// CHECK3-NEXT: store double [[CONV17]], double* [[REF_TMP16]], align 8, !tbaa [[TBAA20]]
+// CHECK3-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]], double* nonnull align 8 dereferenceable(8) [[REF_TMP15]], double* nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR7]]
+// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR7]]
// CHECK3-NEXT: [[TMP45:%.*]] = bitcast double* [[REF_TMP16]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP45]]) #[[ATTR5]]
// CHECK3-NEXT: [[TMP46:%.*]] = bitcast double* [[REF_TMP15]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[TMP46]]) #[[ATTR5]]
-// CHECK3-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex.1"* [[REF_TMP14]] to i8*
+// CHECK3-NEXT: [[TMP47:%.*]] = bitcast %"class.std::complex.0"* [[REF_TMP14]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 16, i8* [[TMP47]]) #[[ATTR5]]
// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK3: omp.body.continue:
@@ -3821,20 +3803,20 @@
// CHECK3-NEXT: [[TMP55:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK3-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 4, !tbaa [[TBAA6]]
// CHECK3-NEXT: [[TMP57:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK3-NEXT: [[TMP58:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM5]] to i8*
+// CHECK3-NEXT: [[TMP58:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM5]] to i8*
// CHECK3-NEXT: store i8* [[TMP58]], i8** [[TMP57]], align 8
// CHECK3-NEXT: [[TMP59:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK3-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP56]], i32 1, i64 8, i8* [[TMP59]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func7, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func8)
+// CHECK3-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP56]], i32 1, i64 8, i8* [[TMP59]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6)
// CHECK3-NEXT: [[TMP61:%.*]] = icmp eq i32 [[TMP60]], 1
// CHECK3-NEXT: br i1 [[TMP61]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3: .omp.reduction.then:
-// CHECK3-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.1"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR7]]
+// CHECK3-NEXT: [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) %"class.std::complex.0"* @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR7]]
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP56]])
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
// CHECK3: .omp.reduction.done:
// CHECK3-NEXT: [[TMP62:%.*]] = bitcast i32* [[I7]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP62]]) #[[ATTR5]]
-// CHECK3-NEXT: [[TMP63:%.*]] = bitcast %"class.std::complex.1"* [[PARTIAL_SUM5]] to i8*
+// CHECK3-NEXT: [[TMP63:%.*]] = bitcast %"class.std::complex.0"* [[PARTIAL_SUM5]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 16, i8* [[TMP63]]) #[[ATTR5]]
// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i32* [[DOTOMP_IS_LAST]] to i8*
// CHECK3-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP64]]) #[[ATTR5]]
@@ -3858,29 +3840,29 @@
//
//
// CHECK3-LABEL: define {{[^@]+}}@_ZNSt7complexIdEpLIdEERS0_RKS_IT_E
-// CHECK3-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], %"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[__C:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK3-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], %"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[__C:%.*]]) #[[ATTR4]] comdat align 2 {
// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
-// CHECK3-NEXT: [[__C_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
-// CHECK3-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: store %"class.std::complex.1"* [[__C]], %"class.std::complex.1"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8
-// CHECK3-NEXT: [[TMP0:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR7]]
-// CHECK3-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 0
-// CHECK3-NEXT: [[TMP1:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA24:![0-9]+]]
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8
+// CHECK3-NEXT: [[__C_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8
+// CHECK3-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK3-NEXT: store %"class.std::complex.0"* [[__C]], %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: [[TMP0:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK3-NEXT: [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR7]]
+// CHECK3-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP1:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA22:![0-9]+]]
// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[CALL]]
-// CHECK3-NEXT: store double [[ADD]], double* [[__RE_]], align 8, !tbaa [[TBAA24]]
-// CHECK3-NEXT: [[TMP2:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR7]]
-// CHECK3-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 1
-// CHECK3-NEXT: [[TMP3:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA26:![0-9]+]]
+// CHECK3-NEXT: store double [[ADD]], double* [[__RE_]], align 8, !tbaa [[TBAA22]]
+// CHECK3-NEXT: [[TMP2:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK3-NEXT: [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR7]]
+// CHECK3-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP3:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA24:![0-9]+]]
// CHECK3-NEXT: [[ADD3:%.*]] = fadd double [[TMP3]], [[CALL2]]
-// CHECK3-NEXT: store double [[ADD3]], double* [[__IM_]], align 8, !tbaa [[TBAA26]]
-// CHECK3-NEXT: ret %"class.std::complex.1"* [[THIS1]]
+// CHECK3-NEXT: store double [[ADD3]], double* [[__IM_]], align 8, !tbaa [[TBAA24]]
+// CHECK3-NEXT: ret %"class.std::complex.0"* [[THIS1]]
//
//
-// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5
// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
@@ -3888,24 +3870,24 @@
// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8
-// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex.1", align 8
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex.0", align 8
// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]]
-// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]]
-// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]]
+// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17]]
+// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]]
+// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]]
// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
-// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA12]]
-// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA12]]
-// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA12]]
+// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2, !tbaa [[TBAA17]]
+// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2, !tbaa [[TBAA17]]
+// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2, !tbaa [[TBAA17]]
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex.1"*
-// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr %"class.std::complex.1", %"class.std::complex.1"* [[TMP12]], i64 1
-// CHECK3-NEXT: [[TMP14:%.*]] = bitcast %"class.std::complex.1"* [[TMP13]] to i8*
-// CHECK3-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex.1"* [[TMP12]] to i64*
-// CHECK3-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex.1"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
+// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to %"class.std::complex.0"*
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr %"class.std::complex.0", %"class.std::complex.0"* [[TMP12]], i64 1
+// CHECK3-NEXT: [[TMP14:%.*]] = bitcast %"class.std::complex.0"* [[TMP13]] to i8*
+// CHECK3-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex.0"* [[TMP12]] to i64*
+// CHECK3-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
// CHECK3-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]]
// CHECK3: .shuffle.pre_cond:
// CHECK3-NEXT: [[TMP17:%.*]] = phi i64* [ [[TMP15]], [[ENTRY:%.*]] ], [ [[TMP28:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
@@ -3927,7 +3909,7 @@
// CHECK3-NEXT: [[TMP29]] = getelementptr i64, i64* [[TMP18]], i64 1
// CHECK3-NEXT: br label [[DOTSHUFFLE_PRE_COND]]
// CHECK3: .shuffle.exit:
-// CHECK3-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.1"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK3-NEXT: [[TMP30:%.*]] = bitcast %"class.std::complex.0"* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP11]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP8]], 0
// CHECK3-NEXT: [[TMP32:%.*]] = icmp eq i16 [[TMP8]], 1
@@ -3945,7 +3927,7 @@
// CHECK3: then:
// CHECK3-NEXT: [[TMP43:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
// CHECK3-NEXT: [[TMP44:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK3-NEXT: call void @"_omp$reduction$reduction_func6"(i8* [[TMP43]], i8* [[TMP44]]) #[[ATTR5]]
+// CHECK3-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP43]], i8* [[TMP44]]) #[[ATTR5]]
// CHECK3-NEXT: br label [[IFCONT:%.*]]
// CHECK3: else:
// CHECK3-NEXT: br label [[IFCONT]]
@@ -3959,11 +3941,11 @@
// CHECK3-NEXT: [[TMP49:%.*]] = load i8*, i8** [[TMP48]], align 8
// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
// CHECK3-NEXT: [[TMP51:%.*]] = load i8*, i8** [[TMP50]], align 8
-// CHECK3-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP49]] to %"class.std::complex.1"*
-// CHECK3-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP51]] to %"class.std::complex.1"*
-// CHECK3-NEXT: [[TMP54:%.*]] = bitcast %"class.std::complex.1"* [[TMP53]] to i8*
-// CHECK3-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.1"* [[TMP52]] to i8*
-// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP54]], i8* align 8 [[TMP55]], i64 16, i1 false), !tbaa.struct !27
+// CHECK3-NEXT: [[TMP52:%.*]] = bitcast i8* [[TMP49]] to %"class.std::complex.0"*
+// CHECK3-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP51]] to %"class.std::complex.0"*
+// CHECK3-NEXT: [[TMP54:%.*]] = bitcast %"class.std::complex.0"* [[TMP53]] to i8*
+// CHECK3-NEXT: [[TMP55:%.*]] = bitcast %"class.std::complex.0"* [[TMP52]] to i8*
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP54]], i8* align 8 [[TMP55]], i64 16, i1 false), !tbaa.struct !25
// CHECK3-NEXT: br label [[IFCONT6:%.*]]
// CHECK3: else5:
// CHECK3-NEXT: br label [[IFCONT6]]
@@ -3971,7 +3953,7 @@
// CHECK3-NEXT: ret void
//
//
-// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6
// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
@@ -4032,7 +4014,7 @@
// CHECK3-NEXT: ret void
//
//
-// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
@@ -4040,7 +4022,7 @@
// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA12]]
+// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2, !tbaa [[TBAA17]]
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA6]]
// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
@@ -4051,9 +4033,9 @@
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32**
// CHECK3-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 2
-// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex.1"**
-// CHECK3-NEXT: [[TMP11:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[TMP10]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]], %"class.std::complex.1"* [[TMP11]]) #[[ATTR5]]
+// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8** [[TMP9]] to %"class.std::complex.0"**
+// CHECK3-NEXT: [[TMP11:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[TMP10]], align 8, !tbaa [[TBAA10]]
+// CHECK3-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]], %"class.std::complex.0"* [[TMP11]]) #[[ATTR5]]
// CHECK3-NEXT: ret void
//
//
@@ -4069,12 +4051,12 @@
// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8
// CHECK3-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0
// CHECK3-NEXT: [[TMP0:%.*]] = load float*, float** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[TMP1:%.*]] = load float, float* [[TMP0]], align 4, !tbaa [[TBAA16]]
-// CHECK3-NEXT: store float [[TMP1]], float* [[__RE_]], align 4, !tbaa [[TBAA18]]
+// CHECK3-NEXT: [[TMP1:%.*]] = load float, float* [[TMP0]], align 4, !tbaa [[TBAA12]]
+// CHECK3-NEXT: store float [[TMP1]], float* [[__RE_]], align 4, !tbaa [[TBAA14]]
// CHECK3-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1
// CHECK3-NEXT: [[TMP2:%.*]] = load float*, float** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[TMP3:%.*]] = load float, float* [[TMP2]], align 4, !tbaa [[TBAA16]]
-// CHECK3-NEXT: store float [[TMP3]], float* [[__IM_]], align 4, !tbaa [[TBAA20]]
+// CHECK3-NEXT: [[TMP3:%.*]] = load float, float* [[TMP2]], align 4, !tbaa [[TBAA12]]
+// CHECK3-NEXT: store float [[TMP3]], float* [[__IM_]], align 4, !tbaa [[TBAA16]]
// CHECK3-NEXT: ret void
//
//
@@ -4085,7 +4067,7 @@
// CHECK3-NEXT: store %"class.std::complex"* [[THIS]], %"class.std::complex"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8
// CHECK3-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 0
-// CHECK3-NEXT: [[TMP0:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA18]]
+// CHECK3-NEXT: [[TMP0:%.*]] = load float, float* [[__RE_]], align 4, !tbaa [[TBAA14]]
// CHECK3-NEXT: ret float [[TMP0]]
//
//
@@ -4096,49 +4078,49 @@
// CHECK3-NEXT: store %"class.std::complex"* [[THIS]], %"class.std::complex"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex"*, %"class.std::complex"** [[THIS_ADDR]], align 8
// CHECK3-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", %"class.std::complex"* [[THIS1]], i32 0, i32 1
-// CHECK3-NEXT: [[TMP0:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA20]]
+// CHECK3-NEXT: [[TMP0:%.*]] = load float, float* [[__IM_]], align 4, !tbaa [[TBAA16]]
// CHECK3-NEXT: ret float [[TMP0]]
//
//
// CHECK3-LABEL: define {{[^@]+}}@_ZNSt7complexIdEC2ERKdS2_
-// CHECK3-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 {
+// CHECK3-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]], double* nonnull align 8 dereferenceable(8) [[__RE:%.*]], double* nonnull align 8 dereferenceable(8) [[__IM:%.*]]) unnamed_addr #[[ATTR3]] comdat align 2 {
// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8
// CHECK3-NEXT: [[__RE_ADDR:%.*]] = alloca double*, align 8
// CHECK3-NEXT: [[__IM_ADDR:%.*]] = alloca double*, align 8
-// CHECK3-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK3-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: store double* [[__RE]], double** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
// CHECK3-NEXT: store double* [[__IM]], double** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8
-// CHECK3-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0
// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[TMP1:%.*]] = load double, double* [[TMP0]], align 8, !tbaa [[TBAA22]]
-// CHECK3-NEXT: store double [[TMP1]], double* [[__RE_]], align 8, !tbaa [[TBAA24]]
-// CHECK3-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP1:%.*]] = load double, double* [[TMP0]], align 8, !tbaa [[TBAA20]]
+// CHECK3-NEXT: store double [[TMP1]], double* [[__RE_]], align 8, !tbaa [[TBAA22]]
+// CHECK3-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1
// CHECK3-NEXT: [[TMP2:%.*]] = load double*, double** [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8, !tbaa [[TBAA22]]
-// CHECK3-NEXT: store double [[TMP3]], double* [[__IM_]], align 8, !tbaa [[TBAA26]]
+// CHECK3-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8, !tbaa [[TBAA20]]
+// CHECK3-NEXT: store double [[TMP3]], double* [[__IM_]], align 8, !tbaa [[TBAA24]]
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_ZNKSt7complexIdE4realEv
-// CHECK3-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK3-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
-// CHECK3-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8
-// CHECK3-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 0
-// CHECK3-NEXT: [[TMP0:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA24]]
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8
+// CHECK3-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP0:%.*]] = load double, double* [[__RE_]], align 8, !tbaa [[TBAA22]]
// CHECK3-NEXT: ret double [[TMP0]]
//
//
// CHECK3-LABEL: define {{[^@]+}}@_ZNKSt7complexIdE4imagEv
-// CHECK3-SAME: (%"class.std::complex.1"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK3-SAME: (%"class.std::complex.0"* nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.1"*, align 8
-// CHECK3-NEXT: store %"class.std::complex.1"* [[THIS]], %"class.std::complex.1"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.1"*, %"class.std::complex.1"** [[THIS_ADDR]], align 8
-// CHECK3-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.1", %"class.std::complex.1"* [[THIS1]], i32 0, i32 1
-// CHECK3-NEXT: [[TMP0:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA26]]
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %"class.std::complex.0"*, align 8
+// CHECK3-NEXT: store %"class.std::complex.0"* [[THIS]], %"class.std::complex.0"** [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK3-NEXT: [[THIS1:%.*]] = load %"class.std::complex.0"*, %"class.std::complex.0"** [[THIS_ADDR]], align 8
+// CHECK3-NEXT: [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", %"class.std::complex.0"* [[THIS1]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP0:%.*]] = load double, double* [[__IM_]], align 8, !tbaa [[TBAA24]]
// CHECK3-NEXT: ret double [[TMP0]]
//
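The updated CHECK lines above all encode the same pattern: each escaped local gets one `__kmpc_alloc_shared` call in the prologue and one `__kmpc_free_shared` call, in reverse allocation order, in the epilogue, replacing the old team-static-memory record. A minimal C++ sketch of that pairing follows; the declarations mirror only the calls visible in the IR above (the real prototypes live in the device runtime, not in this patch), and the function is illustrative, not part of the change:

// Sketch only: mirrors the alloc/free pattern in the CHECK lines above.
extern "C" void *__kmpc_alloc_shared(unsigned long long Size); // i8* (i64) in the IR
extern "C" void __kmpc_free_shared(void *Ptr);                 // void (i8*) in the IR

static void globalization_sketch() {
  // Prologue: one allocation per escaped variable, sized as in the IR.
  void *IStart = __kmpc_alloc_shared(4);       // i32
  void *IEnd = __kmpc_alloc_shared(4);         // i32
  void *PartialSum = __kmpc_alloc_shared(16);  // std::complex<double>
  // ... the region uses the *_ON_STACK casts of these pointers ...
  // Epilogue: frees are emitted in reverse allocation order.
  __kmpc_free_shared(PartialSum);
  __kmpc_free_shared(IEnd);
  __kmpc_free_shared(IStart);
}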
diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -118,7 +118,6 @@
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 8
// CHECK1-NEXT: [[CONV7:%.*]] = bitcast i64* [[A_CASTED]] to i8*
@@ -218,7 +217,6 @@
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 8
// CHECK1-NEXT: [[CONV7:%.*]] = bitcast i64* [[AA_CASTED]] to i16*
@@ -261,7 +259,6 @@
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
@@ -404,7 +401,6 @@
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 4
// CHECK2-NEXT: [[CONV7:%.*]] = bitcast i32* [[A_CASTED]] to i8*
@@ -504,7 +500,6 @@
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK2-NEXT: [[CONV7:%.*]] = bitcast i32* [[AA_CASTED]] to i16*
@@ -547,7 +542,6 @@
// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
@@ -690,7 +684,6 @@
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 4
// CHECK3-NEXT: [[CONV7:%.*]] = bitcast i32* [[A_CASTED]] to i8*
@@ -790,7 +783,6 @@
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK3-NEXT: [[CONV7:%.*]] = bitcast i32* [[AA_CASTED]] to i16*
@@ -833,7 +825,6 @@
// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
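Every hunk in the file above is the same one-line deletion: the explicit data-sharing stack setup (`__kmpc_data_sharing_init_stack` in generic mode, `__kmpc_data_sharing_init_stack_spmd` after `__kmpc_spmd_kernel_init`) is no longer emitted, so the expected prologue shrinks to the kernel-init call alone. A hedged C++ sketch of the before/after, using only the runtime entry points named in the CHECK lines (the argument types are assumptions for illustration):

// Sketch only: kernel prologues as the updated tests expect them.
extern "C" void __kmpc_kernel_init(int ThreadLimit, short RequiresOMPRuntime);      // assumed signature
extern "C" void __kmpc_spmd_kernel_init(int ThreadLimit, short RequiresOMPRuntime); // assumed signature

static void generic_prolog_sketch(int ThreadLimit) {
  __kmpc_kernel_init(ThreadLimit, 1);
  // Old codegen also emitted: __kmpc_data_sharing_init_stack();
  // The shared-memory allocator now needs no per-kernel initialization.
}

static void spmd_prolog_sketch(int NumThreads) {
  __kmpc_spmd_kernel_init(NumThreads, 1);
  // Old codegen (full runtime only): __kmpc_data_sharing_init_stack_spmd();
}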
%clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -35,583 +32,6 @@ } #endif -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker -// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK1: .execute.fn: -// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .check.next: -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 -// CHECK1-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, 
i32 0, i32 0) to i8*), i64 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 9 -// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP8]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] -// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[I]] to i8* -// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP15]], i64 1) -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: 
omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]]) -// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker -// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: 
[[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK2: .execute.fn: -// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .check.next: -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 -// CHECK2-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* 
@[[GLOB2]]) -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) -// CHECK2-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] -// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 
-// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* -// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i64 1) -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK3-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker -// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK3: .await.work: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK3: .select.workers: -// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK3: .execute.parallel: -// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK3: .execute.fn: -// CHECK3-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK3: .check.next: -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK3-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK3: .terminate.parallel: -// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK3: .barrier.parallel: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 -// CHECK3-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK3: .worker: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .mastercheck: -// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: 
[[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK3: .master: -// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK3: .termination.notifier: -// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTEXIT]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: store i32 0, i32* 
[[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 9 -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP8]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] -// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[I]] to i8* -// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP15]], i32 1) -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]]) -// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) 
#[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK3-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker // CHECK4-SAME: () #[[ATTR0:[0-9]+]] { // CHECK4-NEXT: entry: @@ -652,8 +72,6 @@ // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 // CHECK4-SAME: () #[[ATTR1:[0-9]+]] { // CHECK4-NEXT: entry: @@ -695,491 +113,1024 @@ // CHECK4-NEXT: br label [[DOTEXIT]] // CHECK4: .exit: // CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 
addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK4-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 9 +// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP8]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] +// CHECK4-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[I]] to i8* +// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP15]], i32 1) +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// 
CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]]) +// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK4-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK4-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker +// CHECK5-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK5: .await.work: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label 
[[DOTSELECT_WORKERS:%.*]] +// CHECK5: .select.workers: +// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK5: .execute.parallel: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK5: .execute.fn: +// CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK5: .check.next: +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK5: .terminate.parallel: +// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK5: .barrier.parallel: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 +// CHECK5-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK5: .worker: +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .mastercheck: +// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK5: .master: +// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// 
CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK5: .termination.notifier: +// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTEXIT]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK5-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* +// CHECK5-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK5: omp.inner.for.cond: +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK5-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], 
label [[OMP_INNER_FOR_END:%.*]] +// CHECK5: omp.inner.for.body: +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* +// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i32 1) +// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK5: omp.body.continue: +// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK5: omp.inner.for.inc: +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK5: omp.inner.for.end: +// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK5: omp.loop.exit: +// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) +// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], 
i32* [[TMP5]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker +// CHECK6-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK6: .execute.fn: +// CHECK6-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .check.next: +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 +// CHECK6-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: 
[[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK6-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* +// CHECK6-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* 
[[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK6: cond.true: +// CHECK6-NEXT: br label [[COND_END:%.*]] +// CHECK6: cond.false: +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: br label [[COND_END]] +// CHECK6: cond.end: +// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] +// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK6: omp.inner.for.cond: +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK6-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CHECK6-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK6: omp.inner.for.body: +// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* +// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i32 1) +// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK6: omp.body.continue: +// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK6: omp.inner.for.inc: +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK6: omp.inner.for.end: +// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK6: omp.loop.exit: +// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK6-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK6-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK6-NEXT: ret void 
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK6-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker +// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK1: .execute.fn: +// CHECK1-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .check.next: +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ -// 
CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK4-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP1]], i16 [[TMP0]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* -// CHECK4-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 9 -// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP8]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] -// CHECK4-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label 
[[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i32* [[I]] to i8* -// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP15]], i32 1) -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]]) -// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16 +// CHECK1-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: 
[[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK4-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32* +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9 +// CHECK1-NEXT: 
br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I_ON_STACK]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i32* [[I_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP10]], i64 1) +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1 +// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[I]]) +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK4-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* 
[[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker -// CHECK5-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK5: .await.work: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK5: .select.workers: -// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK5: .execute.parallel: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK5-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK5: .execute.fn: -// CHECK5-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK5: .check.next: -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK5-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK5: .terminate.parallel: -// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK5: .barrier.parallel: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// 
CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 -// CHECK5-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK5: .worker: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .mastercheck: -// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK5: .master: -// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK5-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -// CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK5: .termination.notifier: -// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK5-NEXT: 
call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTEXIT]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK2: .execute.fn: +// CHECK2-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .check.next: +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16 +// CHECK2-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker() 
#[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK5-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* -// CHECK5-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK5-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// 
CHECK5-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 -// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK5-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] -// CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 -// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* -// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i32 1) -// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK5: omp.body.continue: -// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK5-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK5: omp.inner.for.end: -// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK5: omp.loop.exit: -// CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) -// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** 
[[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32* +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I_ON_STACK]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i32* [[I_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP10]], i32 1) +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1 +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[I]]) +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* 
noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK5-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK5-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK5-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** 
[[GLOBAL_ARGS]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker -// CHECK6-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK6: .await.work: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK6: .select.workers: -// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK6: .execute.parallel: -// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) -// CHECK6-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] -// CHECK6: .execute.fn: -// CHECK6-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK6: .check.next: -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK6-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) -// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL]] -// CHECK6: .terminate.parallel: -// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK6: .barrier.parallel: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker +// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 
+// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) +// CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] +// CHECK3: .execute.fn: +// CHECK3-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .check.next: +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19 -// CHECK6-SAME: () #[[ATTR1:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK6: .worker: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l19_worker() #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .mastercheck: -// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK6: .master: -// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK6-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -// CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK6: .termination.notifier: -// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTEXIT]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16 +// CHECK3-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l16_worker() #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: 
ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK6-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to %struct._globalized_locals_ty* -// CHECK6-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP1]], i32 0, i32 0 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 -// CHECK6-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK6-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 -// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK6: cond.true: -// CHECK6-NEXT: br label [[COND_END:%.*]] -// CHECK6: cond.false: -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: br label [[COND_END]] -// CHECK6: cond.end: -// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ] -// CHECK6-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK6: omp.inner.for.cond: -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK6-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] -// CHECK6-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK6: omp.inner.for.body: -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 -// CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK6-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i32* [[I]] to i8* -// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK6-NEXT: 
call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP12]], i32 1) -// CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK6: omp.body.continue: -// CHECK6-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK6: omp.inner.for.inc: -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK6-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK6-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK6: omp.inner.for.end: -// CHECK6-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK6: omp.loop.exit: -// CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]]) -// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP0]]) -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: [[I:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[I_ON_STACK:%.*]] = bitcast i8* [[I]] to i32* +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9 +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// 
CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I_ON_STACK]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i32* [[I_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP10]], i32 1) +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1 +// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[I]]) +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK6-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK6-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4 -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[I:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[I_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[I]], i32** [[I_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[I_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK3-NEXT: store i32 [[INC]], i32* [[TMP0]], 
align 4 +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -// CHECK6-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -// CHECK6-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] +// CHECK3-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,26 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ // Test target codegen - host bc file has to be created first. 
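(Editorial aside, not part of the patch — a minimal sketch of the pattern the updated checks assert.) In the CHECK3 hunks above, the outlined function no longer carves its escaped variable out of a %struct._globalized_locals_ty record returned by __kmpc_data_sharing_push_stack; it requests exactly the bytes it needs with __kmpc_alloc_shared and releases them with __kmpc_free_shared. A C++ rendering of that pattern follows; the declarations mirror the IR calls above, and outlined_region is a hypothetical stand-in for __omp_outlined__:

    extern "C" void *__kmpc_alloc_shared(unsigned Size); // replaces __kmpc_data_sharing_push_stack
    extern "C" void __kmpc_free_shared(void *Ptr);       // replaces __kmpc_data_sharing_pop_stack

    static void outlined_region() {
      // Globalize the one escaped loop variable (4 bytes): one allocation
      // per variable, with no per-function record or team-static buffer.
      void *I = __kmpc_alloc_shared(4);
      int *IOnStack = static_cast<int *>(I); // mirrors [[I_ON_STACK]] in the IR above
      for (int IV = 0; IV <= 9; ++IV)
        *IOnStack = IV; // the address escapes to the parallel region's workers
      __kmpc_free_shared(I); // paired release on the function's exit path
    }

Per-variable alloc/free pairs like this are presumably simpler for later passes to analyze than the per-kernel record unions the header diff deletes.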
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix=CHECK3 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK4 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK5 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK6 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK7 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK8 - // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK9 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK10 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ 
-triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix=CHECK11 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK12 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix=CHECK2 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK13 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK14 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK15 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK16 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK4 // expected-no-diagnostics #ifndef HEADER @@ -86,6438 +71,6 @@ } #endif -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: 
[[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[L]], 
i64* [[L_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0 -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* 
[[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4 -// CHECK1-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4 -// CHECK1-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8* -// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8* -// CHECK1-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 -// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK1-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8* -// CHECK1-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 -// CHECK1-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 -// CHECK1-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* 
[[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK1-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] -// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] -// CHECK1: cond.true14: -// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: br label [[COND_END16:%.*]] -// CHECK1: cond.false15: -// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END16]] -// CHECK1: cond.end16: -// CHECK1-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ] -// CHECK1-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) -// CHECK1-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 -// CHECK1-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK1: .omp.lastprivate.then: -// CHECK1-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK1: .omp.lastprivate.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: 
[[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] -// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// 
CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] -// CHECK1-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 -// CHECK1-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK1-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK1-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK1-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK1: omp.dispatch.end: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: 
[[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK1-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK1: .omp.lastprivate.then: -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK1: .omp.lastprivate.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 -// 
CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: 
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 -// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 -// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK1-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 -// CHECK1-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] -// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK1: cond.true11: -// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: br label [[COND_END13:%.*]] -// CHECK1: cond.false12: -// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END13]] -// CHECK1: cond.end13: -// CHECK1-NEXT: 
[[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ]
-// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]])
-// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK1: omp.precond.end:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK1: omp.precond.then:
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
-// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
-// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
-// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
-// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4
-// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
-// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32
-// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
-// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
-// CHECK1-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2
-// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK1: omp.body.continue:
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
-// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
-// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK1: omp.precond.end:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
-// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
-// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK1: .execute:
-// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
-// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK1: .omp.deinit:
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK1: .exit:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
-// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK1: cond.true:
-// CHECK1-NEXT: br label [[COND_END:%.*]]
-// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: br label [[COND_END]]
-// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
-// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
-// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
-// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
-// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
-// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
-// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
-// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3)
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
-// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
-// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
-// CHECK1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9
-// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
-// CHECK1: cond.true5:
-// CHECK1-NEXT: br label [[COND_END7:%.*]]
-// CHECK1: cond.false6:
-// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: br label [[COND_END7]]
-// CHECK1: cond.end7:
-// CHECK1-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ]
-// CHECK1-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
-// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
-// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
-// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
-// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
-// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
-// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
-// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK1: omp.body.continue:
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
-// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
-// CHECK1-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK1: .execute:
-// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
-// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
-// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK1: .omp.deinit:
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK1: .exit:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__6
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
-// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK1: cond.true:
-// CHECK1-NEXT: br label [[COND_END:%.*]]
-// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: br label [[COND_END]]
-// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
-// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
-// CHECK1-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
-// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
-// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
-// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
-// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
-// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
-// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
-// CHECK1-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
-// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
-// CHECK1-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
-// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
-// CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
-// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
-// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
-// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
-// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
-// CHECK1: cond.true7:
-// CHECK1-NEXT: br label [[COND_END9:%.*]]
-// CHECK1: cond.false8:
-// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: br label [[COND_END9]]
-// CHECK1: cond.end9:
-// CHECK1-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
-// CHECK1-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__7
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
-// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
-// CHECK1-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
-// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
-// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
-// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
-// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
-// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
-// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
-// CHECK1-NEXT: store i32 [[ADD8]], i32* [[J]], align 4
-// CHECK1-NEXT: store i32 10, i32* [[K]], align 4
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
-// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
-// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
-// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
-// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
-// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64
-// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]]
-// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
-// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK1: omp.body.continue:
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
-// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
-// CHECK1-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK1: .execute:
-// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK1-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
-// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK1: .omp.deinit:
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK1: .exit:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__8
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I10:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[J11:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
-// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0
-// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1
-// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64
-// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]]
-// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1
-// CHECK1-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[J]], align 4
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK1: land.lhs.true:
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]]
-// CHECK1-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
-// CHECK1: omp.precond.then:
-// CHECK1-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK1-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK1-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV12]])
-// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
-// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK1: cond.true:
-// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK1-NEXT: br label [[COND_END:%.*]]
-// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK1-NEXT: br label [[COND_END]]
-// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
-// CHECK1-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK1-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK1-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
-// CHECK1-NEXT: [[CMP14:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
-// CHECK1-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK1-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: [[CONV15:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK1-NEXT: store i32 [[TMP19]], i32* [[CONV15]], align 4
-// CHECK1-NEXT: [[TMP20:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to i8*
-// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8
-// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8*
-// CHECK1-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
-// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8*
-// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
-// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
-// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
-// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
-// CHECK1-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
-// CHECK1-NEXT: [[TMP31:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP31]], i64 4)
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: [[TMP33:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP32]], [[TMP33]]
-// CHECK1-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK1-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
-// CHECK1-NEXT: store i64 [[ADD17]], i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK1-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK1-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK1-NEXT: [[ADD18:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
-// CHECK1-NEXT: store i64 [[ADD18]], i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK1-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK1-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK1-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]]
-// CHECK1-NEXT: br i1 [[CMP19]], label [[COND_TRUE20:%.*]], label [[COND_FALSE21:%.*]]
-// CHECK1: cond.true20:
-// CHECK1-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK1-NEXT: br label [[COND_END22:%.*]]
-// CHECK1: cond.false21:
-// CHECK1-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK1-NEXT: br label [[COND_END22]]
-// CHECK1: cond.end22:
-// CHECK1-NEXT: [[COND23:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE20]] ], [ [[TMP41]], [[COND_FALSE21]] ]
-// CHECK1-NEXT: store i64 [[COND23]], i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK1-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK1-NEXT: store i64 [[TMP42]], i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
-// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK1: omp.precond.end:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__9
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I10:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[J11:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
-// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0
-// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1
-// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64
-// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]]
-// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1
-// CHECK1-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[J]], align 4
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK1: land.lhs.true:
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]]
-// CHECK1-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
-// CHECK1: omp.precond.then:
-// CHECK1-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
-// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK1-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
-// CHECK1-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[TMP8]], i64* [[DOTOMP_LB]], align 8
-// CHECK1-NEXT: store i64 [[TMP9]], i64* [[DOTOMP_UB]], align 8
-// CHECK1-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
-// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
-// CHECK1-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CMP12:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]]
-// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0
-// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1
-// CHECK1-NEXT: [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]]
-// CHECK1-NEXT: [[CONV16:%.*]] = sext i32 [[MUL15]] to i64
-// CHECK1-NEXT: [[DIV17:%.*]] = sdiv i64 [[TMP15]], [[CONV16]]
-// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 1
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL18]]
-// CHECK1-NEXT: [[CONV19:%.*]] = trunc i64 [[ADD]] to i32
-// CHECK1-NEXT: store i32 [[CONV19]], i32* [[I10]], align 4
-// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[SUB20:%.*]] = sub nsw i32 [[TMP19]], 0
-// CHECK1-NEXT: [[DIV21:%.*]] = sdiv i32 [[SUB20]], 1
-// CHECK1-NEXT: [[MUL22:%.*]] = mul nsw i32 1, [[DIV21]]
-// CHECK1-NEXT: [[CONV23:%.*]] = sext i32 [[MUL22]] to i64
-// CHECK1-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[CONV23]]
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[SUB25:%.*]] = sub nsw i32 [[TMP20]], 0
-// CHECK1-NEXT: [[DIV26:%.*]] = sdiv i32 [[SUB25]], 1
-// CHECK1-NEXT: [[MUL27:%.*]] = mul nsw i32 1, [[DIV26]]
-// CHECK1-NEXT: [[CONV28:%.*]] = sext i32 [[MUL27]] to i64
-// CHECK1-NEXT: [[MUL29:%.*]] = mul nsw i64 [[DIV24]], [[CONV28]]
-// CHECK1-NEXT: [[SUB30:%.*]] = sub nsw i64 [[TMP17]], [[MUL29]]
-// CHECK1-NEXT: [[MUL31:%.*]] = mul nsw i64 [[SUB30]], 1
-// CHECK1-NEXT: [[ADD32:%.*]] = add nsw i64 0, [[MUL31]]
-// CHECK1-NEXT: [[CONV33:%.*]] = trunc i64 [[ADD32]] to i32
-// CHECK1-NEXT: store i32 [[CONV33]], i32* [[J11]], align 4
-// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4
-// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4
-// CHECK1-NEXT: [[ADD34:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4
-// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64
-// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4
-// CHECK1-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP24]] to i64
-// CHECK1-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM35]]
-// CHECK1-NEXT: store i32 [[ADD34]], i32* [[ARRAYIDX36]], align 4
-// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK1: omp.body.continue:
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK1-NEXT: [[ADD37:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
-// CHECK1-NEXT: store i64 [[ADD37]], i64* [[DOTOMP_IV]], align 8
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
-// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK1: omp.precond.end:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
-// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
-// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
-// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK1: .execute:
-// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK1-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
-// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK1: .omp.deinit:
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK1: .exit:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__10
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
-// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
-// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK1: omp.precond.then:
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
-// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK1: cond.true:
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: br label [[COND_END:%.*]]
-// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: br label [[COND_END]]
-// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
-// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
-// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8
-// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8*
-// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8
-// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8*
-// CHECK1-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
-// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8*
-// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
-// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
-// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
-// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
-// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
-// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8*
-// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
-// CHECK1-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
-// CHECK1-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5)
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
-// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
-// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
-// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]]
-// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
-// CHECK1: cond.true11:
-// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: br label [[COND_END13:%.*]]
-// CHECK1: cond.false12:
-// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: br label [[COND_END13]]
-// CHECK1: cond.end13:
-// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ]
-// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
-// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK1: omp.precond.end:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__11
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
-// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
-// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK1: omp.precond.then:
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
-// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
-// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
-// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
-// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4
-// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64
-// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]]
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4
-// CHECK1-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64
-// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]]
-// CHECK1-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4
-// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK1: omp.body.continue:
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
-// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]])
-// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK1: omp.precond.end:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
-// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
-// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
-// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
-// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK2-NEXT: [[TMP0:%.*]] = load
[1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK2-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) -// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to 
%struct._globalized_locals_ty* -// CHECK2-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4 -// CHECK2-NEXT: [[TMP21:%.*]] = load i64, 
i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP21]] to i8* -// CHECK2-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 -// CHECK2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 -// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK2-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8* -// CHECK2-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8 -// CHECK2-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4 -// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP36]], i64 5) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]] -// CHECK2-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]] -// CHECK2-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] -// CHECK2: cond.true14: -// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK2-NEXT: br label [[COND_END16:%.*]] -// CHECK2: cond.false15: -// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END16]] -// CHECK2: cond.end16: -// 
CHECK2-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ] -// CHECK2-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]]) -// CHECK2-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0 -// CHECK2-NEXT: br i1 [[TMP51]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK2: .omp.lastprivate.then: -// CHECK2-NEXT: [[TMP52:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: store i32 [[TMP52]], i32* [[CONV1]], align 8 -// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK2: .omp.lastprivate.done: -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* 
[[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK2: omp.dispatch.cond: -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] -// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] -// CHECK2-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 -// CHECK2-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK2: omp.dispatch.body: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// 
CHECK2-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK2-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK2: omp.dispatch.inc: -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK2: omp.dispatch.end: -// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK2-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK2: .omp.lastprivate.then: -// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK2-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 -// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK2: .omp.lastprivate.done: -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x 
i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp 
slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 -// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 -// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* -// 
CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK2-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 -// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] -// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK2: cond.true11: -// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: br label [[COND_END13:%.*]] -// CHECK2: cond.false12: -// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END13]] -// CHECK2: cond.end13: -// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] -// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// 
CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label 
[[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK2-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 -// CHECK2-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 -// CHECK2-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] 
-// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* -// 
CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 -// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK2: cond.true5: -// CHECK2-NEXT: br label [[COND_END7:%.*]] -// CHECK2: cond.false6: -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END7]] -// CHECK2: cond.end7: -// CHECK2-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] -// CHECK2-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = 
alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK2-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 -// 
CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK2-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: 
store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* 
[[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* -// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 -// CHECK2-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] -// CHECK2-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 -// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] -// CHECK2: cond.true7: -// CHECK2-NEXT: br label [[COND_END9:%.*]] -// CHECK2: cond.false8: -// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END9]] -// CHECK2: cond.end9: -// CHECK2-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] -// CHECK2-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] 
= alloca i32, align 4 -// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK2-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] -// CHECK2-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] -// CHECK2-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 -// CHECK2-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK2-NEXT: 
[[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK2-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] -// CHECK2-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK2-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* 
noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I10:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J11:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK2-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 -// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] -// CHECK2-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK2-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: land.lhs.true: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK2-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK2-NEXT: store i64 
[[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK2-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV12]]) -// CHECK2-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK2-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK2-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK2-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK2-NEXT: [[CMP14:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK2-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK2-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV15:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP19]], i32* [[CONV15]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 -// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* -// CHECK2-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP28:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK2-NEXT: [[TMP29:%.*]] = load i32*, 
i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -// CHECK2-NEXT: [[TMP31:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP31]], i64 4) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: [[TMP33:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP32]], [[TMP33]] -// CHECK2-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK2-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK2-NEXT: store i64 [[ADD17]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK2-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK2-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK2-NEXT: [[ADD18:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK2-NEXT: store i64 [[ADD18]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK2-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK2-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK2-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]] -// CHECK2-NEXT: br i1 [[CMP19]], label [[COND_TRUE20:%.*]], label [[COND_FALSE21:%.*]] -// CHECK2: cond.true20: -// CHECK2-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK2-NEXT: br label [[COND_END22:%.*]] -// CHECK2: cond.false21: -// CHECK2-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK2-NEXT: br label [[COND_END22]] -// CHECK2: cond.end22: -// CHECK2-NEXT: [[COND23:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE20]] ], [ [[TMP41]], [[COND_FALSE21]] ] -// CHECK2-NEXT: store i64 [[COND23]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK2-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK2-NEXT: store i64 [[TMP42]], i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// 
CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I10:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J11:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK2-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 -// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] -// CHECK2-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK2-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: land.lhs.true: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK2-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 -// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK2-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[TMP8]], i64* [[DOTOMP_LB]], align 8 -// CHECK2-NEXT: store i64 [[TMP9]], i64* [[DOTOMP_UB]], align 8 -// CHECK2-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 
8 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) -// CHECK2-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 -// CHECK2-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP12:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]] -// CHECK2-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK2-NEXT: [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1 -// CHECK2-NEXT: [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]] -// CHECK2-NEXT: [[CONV16:%.*]] = sext i32 [[MUL15]] to i64 -// CHECK2-NEXT: [[DIV17:%.*]] = sdiv i64 [[TMP15]], [[CONV16]] -// CHECK2-NEXT: [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL18]] -// CHECK2-NEXT: [[CONV19:%.*]] = trunc i64 [[ADD]] to i32 -// CHECK2-NEXT: store i32 [[CONV19]], i32* [[I10]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[SUB20:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK2-NEXT: [[DIV21:%.*]] = sdiv i32 [[SUB20]], 1 -// CHECK2-NEXT: [[MUL22:%.*]] = mul nsw i32 1, [[DIV21]] -// CHECK2-NEXT: [[CONV23:%.*]] = sext i32 [[MUL22]] to i64 -// CHECK2-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[CONV23]] -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK2-NEXT: [[SUB25:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK2-NEXT: [[DIV26:%.*]] = sdiv i32 [[SUB25]], 1 -// CHECK2-NEXT: [[MUL27:%.*]] = mul nsw i32 1, [[DIV26]] -// CHECK2-NEXT: [[CONV28:%.*]] = sext i32 [[MUL27]] to i64 -// CHECK2-NEXT: [[MUL29:%.*]] = mul nsw i64 [[DIV24]], [[CONV28]] -// CHECK2-NEXT: [[SUB30:%.*]] = sub nsw i64 [[TMP17]], [[MUL29]] -// CHECK2-NEXT: [[MUL31:%.*]] = mul nsw i64 [[SUB30]], 1 -// CHECK2-NEXT: [[ADD32:%.*]] = add nsw i64 0, [[MUL31]] -// CHECK2-NEXT: [[CONV33:%.*]] = trunc i64 [[ADD32]] to i32 -// CHECK2-NEXT: store i32 [[CONV33]], i32* [[J11]], align 4 -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK2-NEXT: [[ADD34:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK2-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP24]] to i64 -// CHECK2-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], 
i64 0, i64 [[IDXPROM35]] -// CHECK2-NEXT: store i32 [[ADD34]], i32* [[ARRAYIDX36]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK2-NEXT: [[ADD37:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] -// CHECK2-NEXT: store i64 [[ADD37]], i64* [[DOTOMP_IV]], align 8 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* 
[[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 -// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK2-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* -// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK2-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK2-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// 
CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] -// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK2: cond.true11: -// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: br label [[COND_END13:%.*]] -// CHECK2: cond.false12: -// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END13]] -// CHECK2: cond.end13: -// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] -// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// 
CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] -// CHECK2-NEXT: 
[[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK2-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] -// CHECK2-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK3-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK3-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) 
#[[ATTR2:[0-9]+]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK3-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 
[[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK3-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK3-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 
-// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK3-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8*
-// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
-// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK3-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8*
-// CHECK3-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8
-// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
-// CHECK3-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
-// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8
-// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
-// CHECK3-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8*
-// CHECK3-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8
-// CHECK3-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4
-// CHECK3-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5)
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK3: omp.inner.for.inc:
-// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP40]], [[TMP41]]
-// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]]
-// CHECK3-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
-// CHECK3-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK3-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]]
-// CHECK3-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]]
-// CHECK3: cond.true14:
-// CHECK3-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK3-NEXT: br label [[COND_END16:%.*]]
-// CHECK3: cond.false15:
-// CHECK3-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: br label [[COND_END16]]
-// CHECK3: cond.end16:
-// CHECK3-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ]
-// CHECK3-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK3: omp.inner.for.end:
-// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK3: omp.loop.exit:
-// CHECK3-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]])
-// CHECK3-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0
-// CHECK3-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
-// CHECK3: .omp.lastprivate.then:
-// CHECK3-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8
-// CHECK3-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8
-// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
-// CHECK3: .omp.lastprivate.done:
-// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK3: omp.precond.end:
-// CHECK3-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
-// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]])
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
-// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
-// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I6:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
-// CHECK3-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
-// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
-// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
-// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK3: omp.precond.then:
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK3-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32
-// CHECK3-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32
-// CHECK3-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
-// CHECK3-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
-// CHECK3: omp.dispatch.cond:
-// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64
-// CHECK3-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]]
-// CHECK3-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK3: cond.true:
-// CHECK3-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: br label [[COND_END:%.*]]
-// CHECK3: cond.false:
-// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64
-// CHECK3-NEXT: br label [[COND_END]]
-// CHECK3: cond.end:
-// CHECK3-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ]
-// CHECK3-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32
-// CHECK3-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
-// CHECK3-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
-// CHECK3: omp.dispatch.body:
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK3: omp.inner.for.cond:
-// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
-// CHECK3-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK3: omp.inner.for.body:
-// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
-// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK3-NEXT: store i32 [[ADD]], i32* [[I6]], align 4
-// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4
-// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
-// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK3-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
-// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4
-// CHECK3-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8
-// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK3: omp.body.continue:
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK3: omp.inner.for.inc:
-// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1
-// CHECK3-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK3: omp.inner.for.end:
-// CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
-// CHECK3: omp.dispatch.inc:
-// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
-// CHECK3-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
-// CHECK3-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: br label [[OMP_DISPATCH_COND]]
-// CHECK3: omp.dispatch.end:
-// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
-// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
-// CHECK3-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
-// CHECK3: .omp.lastprivate.then:
-// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8
-// CHECK3-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8
-// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
-// CHECK3: .omp.lastprivate.done:
-// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK3: omp.precond.end:
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
-// CHECK3-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
-// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK3: .execute:
-// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
-// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
-// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK3: .omp.deinit:
-// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK3: .exit:
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2
-// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
-// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
-// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK3: omp.precond.then:
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
-// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK3: cond.true:
-// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK3-NEXT: br label [[COND_END:%.*]]
-// CHECK3: cond.false:
-// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: br label [[COND_END]]
-// CHECK3: cond.end:
-// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
-// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK3: omp.inner.for.cond:
-// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
-// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK3: omp.inner.for.body:
-// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
-// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
-// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK3-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
-// CHECK3-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8*
-// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8
-// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8*
-// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8
-// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK3-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8*
-// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8
-// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
-// CHECK3-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
-// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8
-// CHECK3-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4
-// CHECK3-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4)
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK3: omp.inner.for.inc:
-// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
-// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
-// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
-// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]]
-// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
-// CHECK3: cond.true11:
-// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK3-NEXT: br label [[COND_END13:%.*]]
-// CHECK3: cond.false12:
-// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: br label [[COND_END13]]
-// CHECK3: cond.end13:
-// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ]
-// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK3: omp.inner.for.end:
-// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK3: omp.loop.exit:
-// CHECK3-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]])
-// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK3: omp.precond.end:
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3
-// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
-// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK3: omp.precond.then:
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK3-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
-// CHECK3-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
-// CHECK3-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK3: omp.inner.for.cond:
-// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
-// CHECK3-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
-// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK3: omp.inner.for.body:
-// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
-// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK3-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
-// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4
-// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
-// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK3-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-// CHECK3-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32
-// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
-// CHECK3-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
-// CHECK3-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2
-// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK3: omp.body.continue:
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK3: omp.inner.for.inc:
-// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
-// CHECK3-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK3: omp.inner.for.end:
-// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK3: omp.loop.exit:
-// CHECK3-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
-// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK3: omp.precond.end:
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
-// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
-// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
-// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
-// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK3: .execute:
-// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
-// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK3: .omp.deinit:
-// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK3: .exit:
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4
-// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
-// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
-// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
-// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
-// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK3: cond.true:
-// CHECK3-NEXT: br label [[COND_END:%.*]]
-// CHECK3: cond.false:
-// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: br label [[COND_END]]
-// CHECK3: cond.end:
-// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
-// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK3: omp.inner.for.cond:
-// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
-// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK3: omp.inner.for.body:
-// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
-// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
-// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
-// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
-// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
-// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK3-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
-// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
-// CHECK3-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3)
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK3: omp.inner.for.inc:
-// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
-// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
-// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
-// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9
-// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
-// CHECK3: cond.true5:
-// CHECK3-NEXT: br label [[COND_END7:%.*]]
-// CHECK3: cond.false6:
-// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: br label [[COND_END7]]
-// CHECK3: cond.end7:
-// CHECK3-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ]
-// CHECK3-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK3: omp.inner.for.end:
-// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK3: omp.loop.exit:
-// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5
-// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
-// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
-// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK3-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
-// CHECK3-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
-// CHECK3-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK3: omp.inner.for.cond:
-// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
-// CHECK3-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
-// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK3: omp.inner.for.body:
-// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
-// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4
-// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
-// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
-// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
-// CHECK3-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
-// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK3: omp.body.continue:
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK3: omp.inner.for.inc:
-// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
-// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK3: omp.inner.for.end:
-// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK3: omp.loop.exit:
-// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
-// CHECK3-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK3-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
-// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
-// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK3: .execute:
-// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
-// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
-// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
-// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
-// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK3: .omp.deinit:
-// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK3: .exit:
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__6
-// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
-// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK3-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
-// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
-// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK3: cond.true:
-// CHECK3-NEXT: br label [[COND_END:%.*]]
-// CHECK3: cond.false:
-// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: br label [[COND_END]]
-// CHECK3: cond.end:
-// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
-// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK3: omp.inner.for.cond:
-// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
-// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK3: omp.inner.for.body:
-// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
-// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
-// CHECK3-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
-// CHECK3-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
-// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
-// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
-// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK3-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
-// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
-// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK3-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
-// CHECK3-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
-// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
-// CHECK3-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
-// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
-// CHECK3-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK3: omp.inner.for.inc:
-// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
-// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
-// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
-// CHECK3-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
-// CHECK3-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
-// CHECK3: cond.true7:
-// CHECK3-NEXT: br label [[COND_END9:%.*]]
-// CHECK3: cond.false8:
-// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: br label [[COND_END9]]
-// CHECK3: cond.end9:
-// CHECK3-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
-// CHECK3-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK3: omp.inner.for.end:
-// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK3: omp.loop.exit:
-// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__7
-// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK3-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
-// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK3-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
-// CHECK3-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
-// CHECK3-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
-// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK3: omp.inner.for.cond:
-// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
-// CHECK3-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
-// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK3: omp.inner.for.body:
-// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
-// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
-// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4
-// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
-// CHECK3-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
-// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
-// CHECK3-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
-// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
-// CHECK3-NEXT: store i32 [[ADD8]], i32* [[J]], align 4
-// CHECK3-NEXT: store i32 10, i32* [[K]], align 4
-// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
-// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
-// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
-// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
-// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
-// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
-// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
-// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
-// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
-// CHECK3-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64
-// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]]
-// CHECK3-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
-// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK3: omp.body.continue:
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK3: omp.inner.for.inc:
-// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
-// CHECK3-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK3: omp.inner.for.end:
-// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK3: omp.loop.exit:
-// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
-// CHECK3-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK3: .execute:
-// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
-// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK3-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
-// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK3: .omp.deinit:
-// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK3: .exit:
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__8
-// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[I8:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[J9:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
-// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
-// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
-// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
-// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]]
-// CHECK3-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1
-// CHECK3-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[J]], align 4
-// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
-// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK3: land.lhs.true:
-// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]]
-// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
-// CHECK3: omp.precond.then:
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK3-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK3-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
-// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]]
-// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK3: cond.true:
-// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK3-NEXT: br label [[COND_END:%.*]]
-// CHECK3: cond.false:
-// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: br label [[COND_END]]
-// CHECK3: cond.end:
-// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
-// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK3: omp.inner.for.cond:
-// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1
-// CHECK3-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]]
-// CHECK3-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK3: omp.inner.for.body:
-// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK3-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
-// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK3-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
-// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK3-NEXT: [[CONV12:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK3-NEXT: store i32 [[TMP21]], i32* [[CONV12]], align 4
-// CHECK3-NEXT: [[TMP22:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8*
-// CHECK3-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
-// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8*
-// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
-// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK3-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to i8*
-// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
-// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
-// CHECK3-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
-// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8
-// CHECK3-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
-// CHECK3-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i64 4)
-// CHECK3-NEXT: br label
[[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK3-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK3-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK3-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] -// CHECK3-NEXT: br i1 [[CMP16]], label [[COND_TRUE17:%.*]], label [[COND_FALSE18:%.*]] -// CHECK3: cond.true17: -// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: br label [[COND_END19:%.*]] -// CHECK3: cond.false18: -// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END19]] -// CHECK3: cond.end19: -// CHECK3-NEXT: [[COND20:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE17]] ], [ [[TMP43]], [[COND_FALSE18]] ] -// CHECK3-NEXT: store i32 [[COND20]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: 
[[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I10:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J11:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] -// CHECK3-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 -// CHECK3-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: land.lhs.true: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK3-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP8]] to i32 -// CHECK3-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV9:%.*]] = trunc i64 [[TMP9]] to i32 -// CHECK3-NEXT: store i32 [[CONV8]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[CONV9]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP12:%.*]] = load 
i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CONV12:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK3-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CMP13:%.*]] = icmp ule i64 [[CONV12]], [[TMP14]] -// CHECK3-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK3-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK3-NEXT: [[MUL16:%.*]] = mul nsw i32 1, [[DIV15]] -// CHECK3-NEXT: [[DIV17:%.*]] = sdiv i32 [[TMP15]], [[MUL16]] -// CHECK3-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL18]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I10]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK3-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1 -// CHECK3-NEXT: [[MUL21:%.*]] = mul nsw i32 1, [[DIV20]] -// CHECK3-NEXT: [[DIV22:%.*]] = sdiv i32 [[TMP18]], [[MUL21]] -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[SUB23:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK3-NEXT: [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1 -// CHECK3-NEXT: [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]] -// CHECK3-NEXT: [[MUL26:%.*]] = mul nsw i32 [[DIV22]], [[MUL25]] -// CHECK3-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP17]], [[MUL26]] -// CHECK3-NEXT: [[MUL28:%.*]] = mul nsw i32 [[SUB27]], 1 -// CHECK3-NEXT: [[ADD29:%.*]] = add nsw i32 0, [[MUL28]] -// CHECK3-NEXT: store i32 [[ADD29]], i32* [[J11]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK3-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK3-NEXT: [[IDXPROM31:%.*]] = sext i32 [[TMP24]] to i64 -// CHECK3-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM31]] -// CHECK3-NEXT: store i32 [[ADD30]], i32* [[ARRAYIDX32]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP25]], [[TMP26]] -// CHECK3-NEXT: store i32 [[ADD33]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: 
[[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK3-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// 
CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK3-NEXT: 
[[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK3-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK3-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK3-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK3-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK3-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* -// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK3-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK3-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], 
align 4 -// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] -// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK3: cond.true11: -// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: br label [[COND_END13:%.*]] -// CHECK3: cond.false12: -// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END13]] -// CHECK3: cond.end13: -// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] -// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* 
[[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK3-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK3-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK3-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK3-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK3-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] -// CHECK3-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD10:%.*]] = add 
nsw i32 [[TMP17]], [[TMP18]] -// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK4-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK4-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// 
CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK4-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) -// CHECK4-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK4-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK4-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] 
-// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK4-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK4-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK4-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK4-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK4-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK4-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK4-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP21]] to i8* -// CHECK4-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 -// CHECK4-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK4-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 -// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK4-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8* -// CHECK4-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8 -// CHECK4-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4 -// CHECK4-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to 
i8*), i8* null, i8** [[TMP36]], i64 5) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK4-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]] -// CHECK4-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]] -// CHECK4-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] -// CHECK4: cond.true14: -// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: br label [[COND_END16:%.*]] -// CHECK4: cond.false15: -// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END16]] -// CHECK4: cond.end16: -// CHECK4-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ] -// CHECK4-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]]) -// CHECK4-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0 -// CHECK4-NEXT: br i1 [[TMP51]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK4: .omp.lastprivate.then: -// CHECK4-NEXT: [[TMP52:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK4-NEXT: store i32 [[TMP52]], i32* [[CONV1]], align 8 -// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK4: .omp.lastprivate.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: 
[[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK4-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK4-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK4-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK4: omp.dispatch.cond: -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], 
align 4 -// CHECK4-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK4-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] -// CHECK4-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] -// CHECK4-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 -// CHECK4-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK4-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK4: omp.dispatch.body: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK4-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK4-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK4-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK4-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK4: omp.dispatch.inc: -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK4-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK4-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: br 
label [[OMP_DISPATCH_COND]] -// CHECK4: omp.dispatch.end: -// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK4-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK4: .omp.lastprivate.then: -// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK4-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 -// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK4: .omp.lastprivate.done: -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK4-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: 
[[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// 
CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK4-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK4-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 -// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 -// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK4-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK4-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK4-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 -// CHECK4-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] -// CHECK4-NEXT: br i1 
[[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK4: cond.true11: -// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: br label [[COND_END13:%.*]] -// CHECK4: cond.false12: -// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END13]] -// CHECK4: cond.end13: -// CHECK4-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] -// CHECK4-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* 
[[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK4-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK4-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK4-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK4-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK4-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 -// CHECK4-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 -// CHECK4-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK4-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: 
[[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], 
i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK4-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK4-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK4-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK4-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 -// CHECK4-NEXT: br i1 [[CMP4]], label 
[[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK4: cond.true5: -// CHECK4-NEXT: br label [[COND_END7:%.*]] -// CHECK4: cond.false6: -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END7]] -// CHECK4: cond.end7: -// CHECK4-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] -// CHECK4-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK4-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK4-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: 
br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK4-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK4-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK4-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x 
i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, 
i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK4-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK4-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK4-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK4-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK4-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK4-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 -// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK4-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* -// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 -// CHECK4-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK4-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] -// CHECK4-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 -// CHECK4-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] -// CHECK4: cond.true7: -// CHECK4-NEXT: br label [[COND_END9:%.*]] -// CHECK4: cond.false8: -// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END9]] -// CHECK4: cond.end9: -// CHECK4-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] -// CHECK4-NEXT: store i32 
[[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK4-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK4-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label 
[[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK4-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] -// CHECK4-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 -// CHECK4-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK4-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK4-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK4-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] -// CHECK4-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK4-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK4-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = 
alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: .execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I8:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J9:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, 
i32* [[CONV]], align 8 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] -// CHECK4-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 -// CHECK4-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK4-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: land.lhs.true: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]] -// CHECK4-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK4-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], 
align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: [[CONV12:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP21]], i32* [[CONV12]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* -// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK4-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to i8* -// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK4-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK4-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK4-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i64 4) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK4-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK4-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK4-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] -// CHECK4-NEXT: br i1 [[CMP16]], label [[COND_TRUE17:%.*]], label [[COND_FALSE18:%.*]] -// CHECK4: cond.true17: -// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: br label [[COND_END19:%.*]] -// CHECK4: cond.false18: -// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END19]] -// CHECK4: cond.end19: -// CHECK4-NEXT: [[COND20:%.*]] = phi i32 [ [[TMP42]], 
[[COND_TRUE17]] ], [ [[TMP43]], [[COND_FALSE18]] ] -// CHECK4-NEXT: store i32 [[COND20]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I10:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[J11:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK4-NEXT: 
[[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] -// CHECK4-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 -// CHECK4-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK4-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: land.lhs.true: -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK4-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP8]] to i32 -// CHECK4-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV9:%.*]] = trunc i64 [[TMP9]] to i32 -// CHECK4-NEXT: store i32 [[CONV8]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[CONV9]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[CONV12:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK4-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CMP13:%.*]] = icmp ule i64 [[CONV12]], [[TMP14]] -// CHECK4-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK4-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK4-NEXT: [[MUL16:%.*]] = mul nsw i32 1, [[DIV15]] -// CHECK4-NEXT: [[DIV17:%.*]] = sdiv i32 [[TMP15]], [[MUL16]] -// CHECK4-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL18]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I10]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK4-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1 -// CHECK4-NEXT: [[MUL21:%.*]] = mul nsw i32 1, [[DIV20]] -// CHECK4-NEXT: [[DIV22:%.*]] = sdiv i32 
[[TMP18]], [[MUL21]] -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK4-NEXT: [[SUB23:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK4-NEXT: [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1 -// CHECK4-NEXT: [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]] -// CHECK4-NEXT: [[MUL26:%.*]] = mul nsw i32 [[DIV22]], [[MUL25]] -// CHECK4-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP17]], [[MUL26]] -// CHECK4-NEXT: [[MUL28:%.*]] = mul nsw i32 [[SUB27]], 1 -// CHECK4-NEXT: [[ADD29:%.*]] = add nsw i32 0, [[MUL28]] -// CHECK4-NEXT: store i32 [[ADD29]], i32* [[J11]], align 4 -// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK4-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 -// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 -// CHECK4-NEXT: [[IDXPROM31:%.*]] = sext i32 [[TMP24]] to i64 -// CHECK4-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM31]] -// CHECK4-NEXT: store i32 [[ADD30]], i32* [[ARRAYIDX32]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP25]], [[TMP26]] -// CHECK4-NEXT: store i32 [[ADD33]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK4-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK4: 
.execute: -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK4: .omp.deinit: -// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// 
CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK4: cond.true: -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: br label [[COND_END:%.*]] -// CHECK4: cond.false: -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END]] -// CHECK4: cond.end: -// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK4-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK4-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK4-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK4-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 -// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 -// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 -// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* 
[[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK4-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* -// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK4-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK4-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] -// CHECK4-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK4: cond.true11: -// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: br label [[COND_END13:%.*]] -// CHECK4: cond.false12: -// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: br label [[COND_END13]] -// CHECK4: cond.end13: -// CHECK4-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] -// CHECK4-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) 
[[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 -// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 -// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK4: omp.precond.then: -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK4-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK4-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* 
[[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK4: omp.inner.for.cond: -// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK4-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK4-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK4-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK4: omp.inner.for.body: -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK4-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 -// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] -// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK4-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK4-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] -// CHECK4-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 -// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK4: omp.body.continue: -// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK4: omp.inner.for.inc: -// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK4: omp.inner.for.end: -// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK4: omp.loop.exit: -// CHECK4-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK4-NEXT: br label [[OMP_PRECOND_END]] -// CHECK4: omp.precond.end: -// CHECK4-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK5-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -6552,8 +105,6 @@ // CHECK5-NEXT: br label [[DOTEXIT:%.*]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -6707,8 +258,6 @@ // CHECK5-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK5-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]]) // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias 
[[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -6834,8 +383,6 @@ // CHECK5-NEXT: br label [[OMP_PRECOND_END]] // CHECK5: omp.precond.end: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK5-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -6864,8 +411,6 @@ // CHECK5-NEXT: br label [[DOTEXIT:%.*]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -6994,8 +539,6 @@ // CHECK5-NEXT: br label [[OMP_PRECOND_END]] // CHECK5: omp.precond.end: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7084,8 +627,6 @@ // CHECK5-NEXT: br label [[OMP_PRECOND_END]] // CHECK5: omp.precond.end: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7108,8 +649,6 @@ // CHECK5-NEXT: br label [[DOTEXIT:%.*]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7201,8 +740,6 @@ // CHECK5: omp.loop.exit: // CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7267,8 +804,6 @@ // CHECK5: omp.loop.exit: // CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK5-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7297,8 +832,6 @@ // CHECK5-NEXT: br label [[DOTEXIT:%.*]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7402,8 +935,6 @@ // CHECK5: omp.loop.exit: // CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK5-SAME: 
(i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7490,8 +1021,6 @@ // CHECK5: omp.loop.exit: // CHECK5-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK5-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7520,8 +1049,6 @@ // CHECK5-NEXT: br label [[DOTEXIT:%.*]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7670,8 +1197,6 @@ // CHECK5-NEXT: br label [[OMP_PRECOND_END]] // CHECK5: omp.precond.end: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7807,8 +1332,6 @@ // CHECK5-NEXT: br label [[OMP_PRECOND_END]] // CHECK5: omp.precond.end: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK5-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7840,8 +1363,6 @@ // CHECK5-NEXT: br label [[DOTEXIT:%.*]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -7976,8 +1497,6 @@ // CHECK5-NEXT: br label [[OMP_PRECOND_END]] // CHECK5: omp.precond.end: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -8068,8 +1587,6 @@ // CHECK5-NEXT: br label [[OMP_PRECOND_END]] // CHECK5: omp.precond.end: // CHECK5-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK6-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK6-NEXT: entry: @@ -8104,8 +1621,6 @@ // CHECK6-NEXT: br label [[DOTEXIT:%.*]] // CHECK6: .exit: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -8259,8 +1774,6 @@ // CHECK6-NEXT: [[TMP54:%.*]] = load i16, i16* 
@"_openmp_static_kernel$is_shared", align 2 // CHECK6-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]]) // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -8386,8 +1899,6 @@ // CHECK6-NEXT: br label [[OMP_PRECOND_END]] // CHECK6: omp.precond.end: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK6-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -8416,8 +1927,6 @@ // CHECK6-NEXT: br label [[DOTEXIT:%.*]] // CHECK6: .exit: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -8546,8 +2055,6 @@ // CHECK6-NEXT: br label [[OMP_PRECOND_END]] // CHECK6: omp.precond.end: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -8636,8 +2143,6 @@ // CHECK6-NEXT: br label [[OMP_PRECOND_END]] // CHECK6: omp.precond.end: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -8660,8 +2165,6 @@ // CHECK6-NEXT: br label [[DOTEXIT:%.*]] // CHECK6: .exit: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -8753,8 +2256,6 @@ // CHECK6: omp.loop.exit: // CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -8819,8 +2320,6 @@ // CHECK6: omp.loop.exit: // CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK6-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -8849,8 +2348,6 @@ // CHECK6-NEXT: br label [[DOTEXIT:%.*]] // CHECK6: .exit: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 
[[F:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -8954,8 +2451,6 @@ // CHECK6: omp.loop.exit: // CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -9042,8 +2537,6 @@ // CHECK6: omp.loop.exit: // CHECK6-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK6-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -9072,8 +2565,6 @@ // CHECK6-NEXT: br label [[DOTEXIT:%.*]] // CHECK6: .exit: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -9222,8 +2713,6 @@ // CHECK6-NEXT: br label [[OMP_PRECOND_END]] // CHECK6: omp.precond.end: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -9359,8 +2848,6 @@ // CHECK6-NEXT: br label [[OMP_PRECOND_END]] // CHECK6: omp.precond.end: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK6-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -9392,8 +2879,6 @@ // CHECK6-NEXT: br label [[DOTEXIT:%.*]] // CHECK6: .exit: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -9528,8 +3013,6 @@ // CHECK6-NEXT: br label [[OMP_PRECOND_END]] // CHECK6: omp.precond.end: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -9620,8 +3103,6 @@ // CHECK6-NEXT: br label [[OMP_PRECOND_END]] // CHECK6: omp.precond.end: // CHECK6-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK7-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -9656,8 +3137,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK7-SAME: (i32* 
noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -9806,8 +3285,6 @@ // CHECK7: omp.precond.end: // CHECK7-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -9933,8 +3410,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK7-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -9963,8 +3438,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10093,8 +3566,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10183,8 +3654,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK7-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10207,8 +3676,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10300,8 +3767,6 @@ // CHECK7: omp.loop.exit: // CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10366,8 +3831,6 @@ // CHECK7: omp.loop.exit: // CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK7-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10396,8 +3859,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: 
define {{[^@]+}}@__omp_outlined__6 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10501,8 +3962,6 @@ // CHECK7: omp.loop.exit: // CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10589,8 +4048,6 @@ // CHECK7: omp.loop.exit: // CHECK7-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK7-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10619,8 +4076,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10769,8 +4224,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10906,8 +4359,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK7-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -10939,8 +4390,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -11075,8 +4524,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -11167,8 +4614,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK8-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // 
CHECK8-NEXT: entry: @@ -11203,8 +4648,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -11353,8 +4796,6 @@ // CHECK8: omp.precond.end: // CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -11480,8 +4921,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK8-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -11510,8 +4949,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -11640,8 +5077,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -11730,8 +5165,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK8-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -11754,8 +5187,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -11847,8 +5278,6 @@ // CHECK8: omp.loop.exit: // CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -11913,8 +5342,6 @@ // CHECK8: omp.loop.exit: // CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK8-SAME: ([10 x [10 x i32]]* nonnull align 4 
dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -11943,8 +5370,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -12048,8 +5473,6 @@ // CHECK8: omp.loop.exit: // CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -12136,8 +5559,6 @@ // CHECK8: omp.loop.exit: // CHECK8-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK8-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -12166,8 +5587,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -12316,8 +5735,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -12453,8 +5870,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK8-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -12486,8 +5901,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -12622,8 +6035,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -12714,8 +6125,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK9-LABEL: 
define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK9-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -12754,8 +6163,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -12915,8 +6322,6 @@ // CHECK9-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK9-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]]) // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13050,8 +6455,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK9-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13082,8 +6485,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13216,8 +6617,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13311,8 +6710,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK9-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13335,8 +6732,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13430,8 +6825,6 @@ // CHECK9: omp.loop.exit: // CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13500,8 +6893,6 @@ // CHECK9: 
omp.loop.exit: // CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK9-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13532,8 +6923,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13641,8 +7030,6 @@ // CHECK9: omp.loop.exit: // CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13735,8 +7122,6 @@ // CHECK9: omp.loop.exit: // CHECK9-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK9-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13767,8 +7152,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -13917,8 +7300,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -14054,8 +7435,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK9-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -14089,8 +7468,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -14229,8 +7606,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 
[[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -14327,8 +7702,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK10-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK10-NEXT: entry: @@ -14367,8 +7740,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -14523,8 +7894,6 @@ // CHECK10: omp.precond.end: // CHECK10-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -14658,8 +8027,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK10-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -14690,8 +8057,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -14824,8 +8189,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -14919,8 +8282,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK10-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -14943,8 +8304,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -15038,8 +8397,6 @@ // CHECK10: omp.loop.exit: // CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__5 // 
CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -15108,8 +8465,6 @@ // CHECK10: omp.loop.exit: // CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK10-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -15140,8 +8495,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -15249,8 +8602,6 @@ // CHECK10: omp.loop.exit: // CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -15343,8 +8694,6 @@ // CHECK10: omp.loop.exit: // CHECK10-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK10-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -15375,8 +8724,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -15525,8 +8872,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -15662,8 +9007,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK10-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -15697,8 +9040,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: 
entry: @@ -15837,8 +9178,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -15935,8 +9274,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK11-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -15975,8 +9312,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16136,8 +9471,6 @@ // CHECK11-NEXT: [[TMP56:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK11-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP56]]) // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16271,8 +9604,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK11-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16303,8 +9634,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16437,8 +9766,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16532,8 +9859,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16556,8 +9881,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK11-SAME: (i32* noalias 
[[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16651,8 +9974,6 @@ // CHECK11: omp.loop.exit: // CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16721,8 +10042,6 @@ // CHECK11: omp.loop.exit: // CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK11-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16753,8 +10072,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16862,8 +10179,6 @@ // CHECK11: omp.loop.exit: // CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16956,8 +10271,6 @@ // CHECK11: omp.loop.exit: // CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK11-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -16988,8 +10301,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -17137,8 +10448,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -17270,8 +10579,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK11-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: 
entry: @@ -17305,8 +10612,6 @@ // CHECK11-NEXT: br label [[DOTEXIT:%.*]] // CHECK11: .exit: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -17445,8 +10750,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK11-NEXT: entry: @@ -17543,8 +10846,6 @@ // CHECK11-NEXT: br label [[OMP_PRECOND_END]] // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK12-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK12-NEXT: entry: @@ -17583,8 +10884,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -17739,8 +11038,6 @@ // CHECK12: omp.precond.end: // CHECK12-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -17874,8 +11171,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK12-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -17906,8 +11201,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -18040,8 +11333,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -18135,8 +11426,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -18159,8 +11448,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -18254,8 +11541,6 @@ // CHECK12: omp.loop.exit: // CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -18324,8 +11609,6 @@ // CHECK12: omp.loop.exit: // CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK12-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -18356,8 +11639,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -18465,8 +11746,6 @@ // CHECK12: omp.loop.exit: // CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -18559,8 +11838,6 @@ // CHECK12: omp.loop.exit: // CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK12-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -18591,8 +11868,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -18740,8 +12015,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__9 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // 
CHECK12-NEXT: entry: @@ -18873,8 +12146,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 // CHECK12-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -18908,8 +12179,6 @@ // CHECK12-NEXT: br label [[DOTEXIT:%.*]] // CHECK12: .exit: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__10 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -19048,8 +12317,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK12-NEXT: entry: @@ -19146,8 +12413,6 @@ // CHECK12-NEXT: br label [[OMP_PRECOND_END]] // CHECK12: omp.precond.end: // CHECK12-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 // CHECK13-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -19182,8 +12447,6 @@ // CHECK13-NEXT: br label [[DOTEXIT:%.*]] // CHECK13: .exit: // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -19337,8 +12600,6 @@ // CHECK13-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK13-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]]) // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -19464,8 +12725,6 @@ // CHECK13-NEXT: br label [[OMP_PRECOND_END]] // CHECK13: omp.precond.end: // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK13-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -19494,8 +12753,6 @@ // CHECK13-NEXT: br label [[DOTEXIT:%.*]] // CHECK13: .exit: // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -19624,8 +12881,6 @@ // CHECK13-NEXT: br label [[OMP_PRECOND_END]] // CHECK13: omp.precond.end: // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__3 // 
CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -19714,8 +12969,6 @@ // CHECK13-NEXT: br label [[OMP_PRECOND_END]] // CHECK13: omp.precond.end: // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK13-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -19738,8 +12991,6 @@ // CHECK13-NEXT: br label [[DOTEXIT:%.*]] // CHECK13: .exit: // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -19831,8 +13082,6 @@ // CHECK13: omp.loop.exit: // CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -19897,8 +13146,6 @@ // CHECK13: omp.loop.exit: // CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 // CHECK13-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -19927,8 +13174,6 @@ // CHECK13-NEXT: br label [[DOTEXIT:%.*]] // CHECK13: .exit: // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -20032,8 +13277,6 @@ // CHECK13: omp.loop.exit: // CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -20120,8 +13363,6 @@ // CHECK13: omp.loop.exit: // CHECK13-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 // CHECK13-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -20150,8 +13391,6 @@ // CHECK13-NEXT: br label [[DOTEXIT:%.*]] // CHECK13: .exit: // CHECK13-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__8 // CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { // CHECK13-NEXT: entry: @@ -20300,8 
+13539,6 @@
// CHECK13-NEXT: br label [[OMP_PRECOND_END]]
// CHECK13: omp.precond.end:
// CHECK13-NEXT: ret void
-//
-//
// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__9
// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
// CHECK13-NEXT: entry:
@@ -20437,8 +13674,6 @@
// CHECK13-NEXT: br label [[OMP_PRECOND_END]]
// CHECK13: omp.precond.end:
// CHECK13-NEXT: ret void
-//
-//
// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
// CHECK13-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
// CHECK13-NEXT: entry:
@@ -20470,8 +13705,6 @@
// CHECK13-NEXT: br label [[DOTEXIT:%.*]]
// CHECK13: .exit:
// CHECK13-NEXT: ret void
-//
-//
// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__10
// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
// CHECK13-NEXT: entry:
@@ -20606,8 +13839,6 @@
// CHECK13-NEXT: br label [[OMP_PRECOND_END]]
// CHECK13: omp.precond.end:
// CHECK13-NEXT: ret void
-//
-//
// CHECK13-LABEL: define {{[^@]+}}@__omp_outlined__11
// CHECK13-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
// CHECK13-NEXT: entry:
@@ -20698,338 +13929,1281 @@
// CHECK13-NEXT: br label [[OMP_PRECOND_END]]
// CHECK13: omp.precond.end:
// CHECK13-NEXT: ret void
-//
-//
// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK14-NEXT: entry:
// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
-// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK14: .execute:
+// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK14-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK14: .omp.deinit:
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK14-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK14: .exit:
+// CHECK14-NEXT: ret void
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
+// CHECK14-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
+// CHECK14-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
+// CHECK14-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0
+// CHECK14-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
+// CHECK14-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4
+//
CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] +// CHECK14-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 +// CHECK14-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] +// CHECK14-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: 
[[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK14-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* +// CHECK14-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK14-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* +// CHECK14-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 +// CHECK14-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK14-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 +// CHECK14-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK14-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* +// CHECK14-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 +// CHECK14-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +// CHECK14-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] +// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] +// CHECK14-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] +// CHECK14-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK14: cond.true11: +// CHECK14-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK14-NEXT: br label [[COND_END13:%.*]] +// CHECK14: cond.false12: +// CHECK14-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END13]] +// CHECK14: cond.end13: +// CHECK14-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] +// CHECK14-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: 
omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) +// CHECK14-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK14-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK14: .omp.lastprivate.then: +// CHECK14-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK14: .omp.lastprivate.done: +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK14-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]]) +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 
[[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK14-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK14: omp.dispatch.cond: +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK14: omp.dispatch.body: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK14-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] 
+// CHECK14-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK14: omp.body.continue: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK14: omp.inner.for.end: +// CHECK14-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK14: omp.dispatch.inc: +// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK14-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK14-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK14: omp.dispatch.end: +// CHECK14-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK14-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK14: .omp.lastprivate.then: +// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK14-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK14: .omp.lastprivate.done: +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK14: .execute: +// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: 
[[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK14-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK14: .omp.deinit: +// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK14-NEXT: br label [[DOTEXIT:%.*]] +// CHECK14: .exit: +// CHECK14-NEXT: ret void +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK14-NEXT: entry: +// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK14: omp.precond.then: +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK14-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK14-NEXT: call void 
@__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK14: cond.true: +// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: br label [[COND_END:%.*]] +// CHECK14: cond.false: +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: br label [[COND_END]] +// CHECK14: cond.end: +// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK14-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK14-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK14-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK14-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK14-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK14-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK14-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK14-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK14-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK14-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// 
CHECK14-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK14-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK14: cond.true10:
+// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: br label [[COND_END12:%.*]]
+// CHECK14: cond.false11:
+// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: br label [[COND_END12]]
+// CHECK14: cond.end12:
+// CHECK14-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK14-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK14-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK14: omp.precond.end:
+// CHECK14-NEXT: ret void
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK14: omp.precond.then:
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK14: omp.inner.for.cond:
+// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK14-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14: omp.inner.for.body:
+// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK14-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK14-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK14-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK14-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK14-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
+// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK14: omp.body.continue:
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK14-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK14: omp.precond.end:
+// CHECK14-NEXT: ret void
+// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK14-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK14: .execute:
+// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK14-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK14: .omp.deinit:
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK14-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK14: .exit:
+// CHECK14-NEXT: ret void
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK14-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK14-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK14: cond.true:
+// CHECK14-NEXT: br label [[COND_END:%.*]]
+// CHECK14: cond.false:
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: br label [[COND_END]]
+// CHECK14: cond.end:
+// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK14: omp.inner.for.cond:
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK14-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14: omp.inner.for.body:
+// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK14-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK14-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK14-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK14-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK14-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK14-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK14-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK14-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK14-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK14-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK14-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK14-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK14: cond.true5:
+// CHECK14-NEXT: br label [[COND_END7:%.*]]
+// CHECK14: cond.false6:
+// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: br label [[COND_END7]]
+// CHECK14: cond.end7:
+// CHECK14-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK14-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK14-NEXT: ret void
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK14: omp.inner.for.cond:
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14: omp.inner.for.body:
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK14-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK14-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK14-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK14: omp.body.continue:
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK14-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK14-NEXT: ret void
+// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK14-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK14: .execute:
+// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK14-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK14: .omp.deinit:
+// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK14-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK14: .exit:
+// CHECK14-NEXT: ret void
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK14-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK14-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK14: cond.true:
+// CHECK14-NEXT: br label [[COND_END:%.*]]
+// CHECK14: cond.false:
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: br label [[COND_END]]
+// CHECK14: cond.end:
+// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK14: omp.inner.for.cond:
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK14-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14: omp.inner.for.body:
+// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK14-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK14-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK14-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK14-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK14-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK14-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK14-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK14-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK14-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK14-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK14-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK14-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK14-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK14-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK14-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK14-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK14-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK14: cond.true6:
+// CHECK14-NEXT: br label [[COND_END8:%.*]]
+// CHECK14: cond.false7:
+// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: br label [[COND_END8]]
+// CHECK14: cond.end8:
+// CHECK14-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK14-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK14-NEXT: ret void
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK14: omp.inner.for.cond:
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14: omp.inner.for.body:
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK14-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK14-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK14-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK14-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK14-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK14-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK14-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK14-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK14-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK14: omp.body.continue:
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK14-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK14: omp.inner.for.end:
+// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK14: omp.loop.exit:
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK14-NEXT: ret void
+// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK14-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK14-NEXT: entry:
+// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
-// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK14: .execute:
-// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
-// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4
-// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4
// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK14-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK14-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]]
// CHECK14: .omp.deinit:
// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK14-NEXT: br label [[DOTEXIT:%.*]]
// CHECK14: .exit:
// CHECK14-NEXT: ret void
-//
-//
-// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__
-// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
// CHECK14-NEXT: entry:
// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
-// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[J10:%.*]] = alloca i32, align 4
// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
-// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
-// CHECK14-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
-// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
-// CHECK14-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
-// CHECK14-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
-// CHECK14-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0
-// CHECK14-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
-// CHECK14-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
-// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK14-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK14-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK14-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK14-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK14-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK14-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
// CHECK14-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]]
-// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK14-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK14-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK14: land.lhs.true:
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK14-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
// CHECK14: omp.precond.then:
-// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK14-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
-// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
-// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-// CHECK14-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK14-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK14-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK14-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK14: cond.true:
-// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
// CHECK14-NEXT: br label [[COND_END:%.*]]
// CHECK14: cond.false:
-// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
// CHECK14-NEXT: br label [[COND_END]]
// CHECK14: cond.end:
-// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
-// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK14-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK14: omp.inner.for.cond:
-// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1
-// CHECK14-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]]
-// CHECK14-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK14-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK14-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
// CHECK14: omp.inner.for.body:
-// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK14-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+// CHECK14-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
// CHECK14-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
-// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4
-// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4
-// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
-// CHECK14-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK14-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK14-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8*
+// CHECK14-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK14-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8*
// CHECK14-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
-// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
-// CHECK14-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK14-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8*
// CHECK14-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
-// CHECK14-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
-// CHECK14-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK14-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK14-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
// CHECK14-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
-// CHECK14-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
-// CHECK14-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
-// CHECK14-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4
-// CHECK14-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
-// CHECK14-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8*
-// CHECK14-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4
-// CHECK14-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4
-// CHECK14-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5)
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK14: omp.inner.for.inc:
-// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]]
-// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]]
-// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK14-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]]
-// CHECK14-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
-// CHECK14-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
-// CHECK14: cond.true11:
-// CHECK14-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: br label [[COND_END13:%.*]]
-// CHECK14: cond.false12:
-// CHECK14-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: br label [[COND_END13]]
-// CHECK14: cond.end13:
-// CHECK14-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ]
-// CHECK14-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK14-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4)
+// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK14: omp.inner.for.inc:
+// CHECK14-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK14-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK14-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK14-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK14-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK14-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]]
+// CHECK14-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]]
+// CHECK14-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK14: cond.true18:
+// CHECK14-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: br label [[COND_END20:%.*]]
+// CHECK14: cond.false19:
+// CHECK14-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: br label [[COND_END20]]
+// CHECK14: cond.end20:
+// CHECK14-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ]
+// CHECK14-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK14-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK14-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8
// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK14: omp.inner.for.end:
// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
// CHECK14: omp.loop.exit:
-// CHECK14-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4
-// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]])
-// CHECK14-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK14-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
-// CHECK14-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
-// CHECK14: .omp.lastprivate.then:
-// CHECK14-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4
-// CHECK14-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
-// CHECK14: .omp.lastprivate.done:
+// CHECK14-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
// CHECK14-NEXT: br label [[OMP_PRECOND_END]]
// CHECK14: omp.precond.end:
-// CHECK14-NEXT: [[TMP54:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
-// CHECK14-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP54]])
// CHECK14-NEXT: ret void
-//
-//
-// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__1
-// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
// CHECK14-NEXT: entry:
// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
-// CHECK14-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK14-NEXT: [[J12:%.*]] = alloca i32, align 4
// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
-// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK14-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK14-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK14-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK14-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK14-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
// CHECK14-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK14-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK14-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK14: land.lhs.true:
+// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK14-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
// CHECK14: omp.precond.then:
-// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
-// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
-// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
-// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK14-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK14-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK14-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
+// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK14-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK14-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8
+// CHECK14-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8
+// CHECK14-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
-// CHECK14-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
-// CHECK14: omp.dispatch.cond:
-// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK14-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]]
-// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK14: cond.true:
-// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK14-NEXT: br label [[COND_END:%.*]]
-// CHECK14: cond.false:
-// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK14-NEXT: br label [[COND_END]]
-// CHECK14: cond.end:
-// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
-// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
-// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK14-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK14-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
-// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
-// CHECK14: omp.dispatch.body:
+// CHECK14-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK14-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK14-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK14-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK14: omp.inner.for.cond:
-// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK14-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
-// CHECK14-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK14-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK14-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK14-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]]
+// CHECK14-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
// CHECK14: omp.inner.for.body:
-// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
-// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
-// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
-// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
-// CHECK14-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
-// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
-// CHECK14-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
+// CHECK14-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK14-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK14-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
+// CHECK14-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
+// CHECK14-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]]
+// CHECK14-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
+// CHECK14-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
+// CHECK14-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK14-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4
+// CHECK14-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK14-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK14-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK14-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
+// CHECK14-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]]
+// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK14-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK14-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
+// CHECK14-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
+// CHECK14-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
+// CHECK14-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
+// CHECK14-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]]
+// CHECK14-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
+// CHECK14-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
+// CHECK14-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
+// CHECK14-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4
+// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK14-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]]
+// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK14-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]]
+// CHECK14-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4
// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK14: omp.body.continue:
// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK14: omp.inner.for.inc:
-// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
-// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK14-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK14-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK14-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK14-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8
// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK14: omp.inner.for.end:
-// CHECK14-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
-// CHECK14: omp.dispatch.inc:
-// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
-// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
-// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK14-NEXT: [[TMP25:%.*]] = load
i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK14-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK14: omp.dispatch.end: -// CHECK14-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK14-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK14: .omp.lastprivate.then: -// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 -// CHECK14-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK14: .omp.lastprivate.done: +// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK14: omp.loop.exit: +// CHECK14-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) // CHECK14-NEXT: br label [[OMP_PRECOND_END]] // CHECK14: omp.precond.end: // CHECK14-NEXT: ret void -// -// -// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 +// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 // CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 // CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) // CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] @@ -21038,23 +15212,23 @@ // CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 // CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK14-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 // CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK14-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x 
i16]* [[TMP0]]) #[[ATTR2]] +// CHECK14-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] // CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] // CHECK14: .omp.deinit: // CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK14-NEXT: br label [[DOTEXIT:%.*]] // CHECK14: .exit: // CHECK14-NEXT: ret void -// -// -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 // CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 @@ -21066,12 +15240,13 @@ // CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 // CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 // CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 // CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 @@ -21121,72 +15296,75 @@ // CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK14-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 // CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK14-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK14-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK14-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK14-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK14-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK14-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK14-NEXT: [[TMP23:%.*]] = inttoptr 
i32 [[TMP17]] to i8* -// CHECK14-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK14-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK14-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK14-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK14-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK14-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK14-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK14-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK14-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK14-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 +// CHECK14-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK14-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK14-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK14-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK14-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK14-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK14-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK14-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK14-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK14-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK14-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK14-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) // CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] // CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK14-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// 
CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] // CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] // CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] // CHECK14-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] // CHECK14: cond.true10: -// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK14-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 // CHECK14-NEXT: br label [[COND_END12:%.*]] // CHECK14: cond.false11: -// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK14-NEXT: br label [[COND_END12]] // CHECK14: cond.end12: -// CHECK14-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK14-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] // CHECK14-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK14-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 // CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK14: omp.inner.for.end: // CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK14: omp.loop.exit: -// CHECK14-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK14-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) // CHECK14-NEXT: br label [[OMP_PRECOND_END]] // CHECK14: omp.precond.end: // CHECK14-NEXT: ret void -// -// -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) 
[[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 // CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 // CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 @@ -21202,8 +15380,9 @@ // CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 // CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 // CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 // CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 // CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 // CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 @@ -21231,4117 +15410,9360 @@ // CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 // CHECK14-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 // CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK14-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK14-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK14-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK14-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK14-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK14-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK14: omp.body.continue: -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// 
CHECK14: omp.loop.exit: -// CHECK14-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK14-NEXT: br label [[OMP_PRECOND_END]] -// CHECK14: omp.precond.end: -// CHECK14-NEXT: ret void -// -// -// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK14-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK14: .execute: -// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK14-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK14: .omp.deinit: -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK14-NEXT: br label [[DOTEXIT:%.*]] -// CHECK14: .exit: -// CHECK14-NEXT: ret void -// -// -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 
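// Annotation (not part of the generated checks): the integer literals passed to
// __kmpc_for_static_init_4 in the surrounding checks are OpenMP runtime schedule
// kinds — to my reading, 91 selects distribute-static-chunked (here chunked by
// the launch's thread count) and 33 selects static-chunked for the inner
// worksharing loop. A plausible source shape for this removed CHECK14 kernel
// over the 10-element array b — a hedged sketch only; ftemplate and b come from
// the check lines, while the loop body and clauses are assumptions, since the
// body of the removed __omp_outlined__5 is not shown here:
//
//   template <typename T> T ftemplate(int n) {
//     int b[10];
//   #pragma omp target teams distribute parallel for
//     for (int i = 0; i < 10; ++i)
//       b[i] += 1;            // assumed body; only the 0..9 trip count is
//                             // visible in the checks (UB clamped to 9)
//     return b[0];
//   }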
-// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK14-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK14: cond.true: -// CHECK14-NEXT: br label [[COND_END:%.*]] -// CHECK14: cond.false: -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END]] -// CHECK14: cond.end: -// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK14-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK14-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK14-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK14-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK14-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK14-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK14-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK14-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK14-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK14-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], 
align 4 -// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK14: cond.true5: -// CHECK14-NEXT: br label [[COND_END7:%.*]] -// CHECK14: cond.false6: -// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END7]] -// CHECK14: cond.end7: -// CHECK14-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK14-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK14: omp.inner.for.cond: +// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK14-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK14-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK14: omp.inner.for.body: +// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK14-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK14-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK14-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK14: omp.body.continue: +// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK14: omp.inner.for.inc: +// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 // CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] // CHECK14: omp.inner.for.end: // CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // CHECK14: omp.loop.exit: -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK14-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK14-NEXT: br label [[OMP_PRECOND_END]] +// CHECK14: omp.precond.end: // CHECK14-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// 
CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK15: .execute: +// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK15-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK15: .omp.deinit: +// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK15-NEXT: br label [[DOTEXIT:%.*]] +// CHECK15: .exit: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 
1) +// CHECK15-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK15-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK15-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK15-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK15: cond.true: +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: br label [[COND_END:%.*]] +// CHECK15: cond.false: +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END]] +// CHECK15: cond.end: +// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK15-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK15-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK15-NEXT: [[TMP20:%.*]] = load 
i32, i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK15-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK15-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK15-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK15-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK15-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK15-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK15-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK15-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK15-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK15-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK15-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK15-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK15-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK15-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK15-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK15-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK15-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK15-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK15-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK15: cond.true11: +// CHECK15-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK15-NEXT: br label [[COND_END13:%.*]] +// CHECK15: cond.false12: +// CHECK15-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END13]] +// CHECK15: cond.end13: +// CHECK15-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], 
[[COND_FALSE12]] ] +// CHECK15-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK15-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK15-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK15: .omp.lastprivate.then: +// CHECK15-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK15: .omp.lastprivate.done: +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 
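// Annotation (not part of the generated checks): under the CHECK15 prefix the
// team-level outlined function still exercises the old globalization API — it
// pairs __kmpc_data_sharing_push_stack(4, /*UseSharedMemory=*/1) at entry,
// reserving 4 bytes of runtime-managed storage for the lastprivate l, with
// __kmpc_data_sharing_pop_stack just before returning, as checked above. The
// schedule constants in these checks (91 with chunk 128 above, 33 with chunk 32
// in the lines that follow) suggest a source shape like this hedged sketch; the
// pragma clauses are inferred from the checks, not quoted from the test:
//
//   template <typename T> T ftemplate(int n) {
//     int a[1000], l;
//   #pragma omp target teams distribute parallel for map(tofrom: a) \
//       lastprivate(l) dist_schedule(static, 128) schedule(static, 32)
//     for (int i = 0; i < n; ++i) {
//       a[i] = 1;   // matches the `store i32 1` into ARRAYIDX below
//       l = i;      // matches the store of I3 into L_ADDR below
//     }
//     return l;
//   }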
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK15-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK15: omp.dispatch.cond: +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK15: cond.true: +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: br label [[COND_END:%.*]] +// CHECK15: cond.false: +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END]] +// CHECK15: cond.end: +// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK15: omp.dispatch.body: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK15-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], 
align 4
+// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK15-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK15-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
+// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK15: omp.body.continue:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK15: omp.dispatch.inc:
+// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK15-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK15: omp.dispatch.end:
+// CHECK15-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK15-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK15: .omp.lastprivate.then:
+// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4
+// CHECK15-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK15: .omp.lastprivate.done:
+// CHECK15-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.end:
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK15: .execute:
+// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK15-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK15: .omp.deinit:
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK15-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK15: .exit:
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK15: omp.precond.then:
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK15-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK15-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
+// CHECK15-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK15-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK15-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK15-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK15-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK15-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK15-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK15-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK15-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK15-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK15-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK15-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK15: cond.true10:
+// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: br label [[COND_END12:%.*]]
+// CHECK15: cond.false11:
+// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END12]]
+// CHECK15: cond.end12:
+// CHECK15-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK15-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK15-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.end:
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK15: omp.precond.then:
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK15-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK15-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK15-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK15-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK15-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK15-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
+// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK15: omp.body.continue:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK15-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.end:
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK15-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK15: .execute:
+// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK15-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK15: .omp.deinit:
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK15-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK15: .exit:
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK15-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK15-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK15-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK15-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK15-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK15-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK15-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK15-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK15-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK15-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK15: cond.true5:
+// CHECK15-NEXT: br label [[COND_END7:%.*]]
+// CHECK15: cond.false6:
+// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END7]]
+// CHECK15: cond.end7:
+// CHECK15-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK15-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK15-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK15-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK15-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK15: omp.body.continue:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK15-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK15-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK15: .execute:
+// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK15-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK15: .omp.deinit:
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK15-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK15: .exit:
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK15-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK15-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK15-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK15-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK15-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK15-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK15-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK15-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK15-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK15-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK15-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK15-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK15-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK15-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK15: cond.true6:
+// CHECK15-NEXT: br label [[COND_END8:%.*]]
+// CHECK15: cond.false7:
+// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END8]]
+// CHECK15: cond.end8:
+// CHECK15-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK15-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK15-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK15-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK15-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK15-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK15-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK15-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK15-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK15-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK15-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK15: omp.body.continue:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK15-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK15-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK15: .execute:
+// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK15-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK15: .omp.deinit:
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK15-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK15: .exit:
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK15-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK15-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK15-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK15-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK15-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK15: land.lhs.true:
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK15-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.then:
+// CHECK15-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK15-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK15-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK15-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK15-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+// CHECK15-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
+// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8*
+// CHECK15-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK15-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK15-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK15-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK15-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK15-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK15-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK15-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK15-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK15-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK15-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK15-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4)
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK15-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK15-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK15-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK15-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK15-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]]
+// CHECK15-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]]
+// CHECK15-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK15: cond.true18:
+// CHECK15-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: br label [[COND_END20:%.*]]
+// CHECK15: cond.false19:
+// CHECK15-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: br label [[COND_END20]]
+// CHECK15: cond.end20:
+// CHECK15-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ]
+// CHECK15-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK15-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK15-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK15-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.end:
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK15-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK15-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK15-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK15-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK15-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK15: land.lhs.true:
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK15-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.then:
+// CHECK15-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK15-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK15-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK15-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8
+// CHECK15-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8
+// CHECK15-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK15-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK15-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK15-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]]
+// CHECK15-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK15-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK15-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
+// CHECK15-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
+// CHECK15-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]]
+// CHECK15-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
+// CHECK15-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK15-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4
+// CHECK15-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK15-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK15-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK15-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
+// CHECK15-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]]
+// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK15-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK15-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
+// CHECK15-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
+// CHECK15-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
+// CHECK15-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
+// CHECK15-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]]
+// CHECK15-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
+// CHECK15-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
+// CHECK15-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
+// CHECK15-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4
+// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK15-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]]
+// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK15-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]]
+// CHECK15-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4
+// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK15: omp.body.continue:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK15-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK15-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK15-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK15: omp.precond.end:
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
+// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK15: .execute:
+// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK15-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
+// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK15: .omp.deinit:
+// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK15-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK15: .exit:
+// CHECK15-NEXT: ret void
+// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK15: omp.precond.then:
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK15-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK15-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK15-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK15-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4
+// CHECK15-NEXT: [[TMP21:%.*]] = getelementptr inbounds
[5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK15-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK15-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 +// CHECK15-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK15-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK15-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 +// CHECK15-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK15-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK15-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 +// CHECK15-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK15-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* +// CHECK15-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 +// CHECK15-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +// CHECK15-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] +// CHECK15-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK15: cond.true10: +// CHECK15-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: br label [[COND_END12:%.*]] +// CHECK15: cond.false11: +// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: br label [[COND_END12]] +// CHECK15: cond.end12: +// CHECK15-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] +// CHECK15-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK15-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 
4 +// CHECK15-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK15-NEXT: entry: +// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK15: omp.precond.then: +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// 
CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK15-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK15: omp.inner.for.cond: +// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK15-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK15-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK15: omp.inner.for.body: +// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK15-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK15-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK15: omp.body.continue: +// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK15: omp.inner.for.inc: +// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK15: omp.inner.for.end: +// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK15: omp.loop.exit: +// CHECK15-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK15-NEXT: br label [[OMP_PRECOND_END]] +// CHECK15: omp.precond.end: +// CHECK15-NEXT: ret void +// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 +// CHECK16-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK16-NEXT: entry: +// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK16-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// 
CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK16: .execute: +// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK16-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK16: .omp.deinit: +// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK16-NEXT: br label [[DOTEXIT:%.*]] +// CHECK16: .exit: +// CHECK16-NEXT: ret void +// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK16-NEXT: entry: +// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK16-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK16-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK16-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK16-NEXT: [[L1:%.*]] = 
getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK16-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK16-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK16-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK16: omp.precond.then: +// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK16-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK16-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK16-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK16-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK16: cond.true: +// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK16-NEXT: br label [[COND_END:%.*]] +// CHECK16: cond.false: +// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: br label [[COND_END]] +// CHECK16: cond.end: +// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK16: omp.inner.for.cond: +// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK16-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK16-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK16: omp.inner.for.body: +// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK16-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK16-NEXT: 
[[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK16-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK16-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK16-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK16-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK16-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK16-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK16-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK16-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK16-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK16-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK16-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK16-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK16-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK16-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK16-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK16-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK16-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK16: omp.inner.for.inc: +// CHECK16-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK16-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK16-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK16-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK16-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK16-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK16: cond.true11: +// CHECK16-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK16-NEXT: br label [[COND_END13:%.*]] +// CHECK16: cond.false12: +// CHECK16-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: br label [[COND_END13]] +// CHECK16: cond.end13: +// CHECK16-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] +// CHECK16-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP45:%.*]] 
= load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK16: omp.inner.for.end: +// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK16: omp.loop.exit: +// CHECK16-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK16-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK16-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK16-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK16: .omp.lastprivate.then: +// CHECK16-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4 +// CHECK16-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK16: .omp.lastprivate.done: +// CHECK16-NEXT: br label [[OMP_PRECOND_END]] +// CHECK16: omp.precond.end: +// CHECK16-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK16-NEXT: ret void +// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK16-NEXT: entry: +// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK16-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK16-NEXT: 
[[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK16-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK16-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK16: omp.precond.then: +// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK16-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK16-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK16-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK16-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK16: omp.dispatch.cond: +// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK16-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK16-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK16: cond.true: +// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK16-NEXT: br label [[COND_END:%.*]] +// CHECK16: cond.false: +// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK16-NEXT: br label [[COND_END]] +// CHECK16: cond.end: +// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK16-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK16-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK16-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK16: omp.dispatch.body: +// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK16: omp.inner.for.cond: +// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK16-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK16-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK16: omp.inner.for.body: +// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK16-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK16-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr 
inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK16-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK16-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK16: omp.body.continue: +// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK16: omp.inner.for.inc: +// CHECK16-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK16-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK16: omp.inner.for.end: +// CHECK16-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK16: omp.dispatch.inc: +// CHECK16-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK16-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK16-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK16-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK16-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK16-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK16-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK16: omp.dispatch.end: +// CHECK16-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK16-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK16-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK16-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK16: .omp.lastprivate.then: +// CHECK16-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK16-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK16: .omp.lastprivate.done: +// CHECK16-NEXT: br label [[OMP_PRECOND_END]] +// CHECK16: omp.precond.end: +// CHECK16-NEXT: ret void +// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK16-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK16-NEXT: entry: +// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK16: .execute: +// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store 
i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK16-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK16: .omp.deinit: +// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK16-NEXT: br label [[DOTEXIT:%.*]] +// CHECK16: .exit: +// CHECK16-NEXT: ret void +// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK16-NEXT: entry: +// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK16-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK16-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK16-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK16: omp.precond.then: +// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK16-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK16-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], 
align 4 +// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK16-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK16-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK16: cond.true: +// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK16-NEXT: br label [[COND_END:%.*]] +// CHECK16: cond.false: +// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: br label [[COND_END]] +// CHECK16: cond.end: +// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK16: omp.inner.for.cond: +// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK16-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK16-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK16: omp.inner.for.body: +// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK16-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK16-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK16-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK16-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK16-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK16-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK16-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK16-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK16-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK16-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK16-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK16-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK16-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK16-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// 
CHECK16: omp.inner.for.inc: +// CHECK16-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK16-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK16-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK16-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK16-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK16-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK16: cond.true10: +// CHECK16-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK16-NEXT: br label [[COND_END12:%.*]] +// CHECK16: cond.false11: +// CHECK16-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: br label [[COND_END12]] +// CHECK16: cond.end12: +// CHECK16-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK16-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK16: omp.inner.for.end: +// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK16: omp.loop.exit: +// CHECK16-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK16-NEXT: br label [[OMP_PRECOND_END]] +// CHECK16: omp.precond.end: +// CHECK16-NEXT: ret void +// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK16-NEXT: entry: +// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// 
CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK16-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK16-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK16-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK16: omp.precond.then: +// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK16-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK16-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK16-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK16-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK16: omp.inner.for.cond: +// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK16-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK16-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK16: omp.inner.for.body: +// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK16-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK16-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK16-NEXT: [[CONV:%.*]] = 
sext i16 [[TMP14]] to i32 +// CHECK16-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK16-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK16-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK16: omp.body.continue: +// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK16: omp.inner.for.inc: +// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK16-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK16: omp.inner.for.end: +// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK16: omp.loop.exit: +// CHECK16-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK16-NEXT: br label [[OMP_PRECOND_END]] +// CHECK16: omp.precond.end: +// CHECK16-NEXT: ret void +// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK16-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK16-NEXT: entry: +// CHECK16-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK16-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK16: .execute: +// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK16-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK16: .omp.deinit: +// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK16-NEXT: br label [[DOTEXIT:%.*]] +// CHECK16: .exit: +// CHECK16-NEXT: ret void +// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK16-NEXT: entry: +// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK16-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], 
i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK16-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK16-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK16-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK16: cond.true: +// CHECK16-NEXT: br label [[COND_END:%.*]] +// CHECK16: cond.false: +// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: br label [[COND_END]] +// CHECK16: cond.end: +// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK16: omp.inner.for.cond: +// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK16-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK16: omp.inner.for.body: +// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK16-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK16-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK16-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK16-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK16-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK16-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK16-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK16-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK16-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK16-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK16: omp.inner.for.inc: +// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 
+// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK16-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK16-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK16-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK16-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK16: cond.true5:
+// CHECK16-NEXT: br label [[COND_END7:%.*]]
+// CHECK16: cond.false6:
+// CHECK16-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: br label [[COND_END7]]
+// CHECK16: cond.end7:
+// CHECK16-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK16-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK16: omp.inner.for.end:
+// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK16: omp.loop.exit:
+// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK16-NEXT: ret void
+// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK16-NEXT: entry:
+// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK16-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK16-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK16-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK16: omp.inner.for.cond:
+// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK16: omp.inner.for.body:
+// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK16-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK16-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK16-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK16: omp.body.continue:
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK16: omp.inner.for.inc:
+// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK16-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK16: omp.inner.for.end:
+// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK16: omp.loop.exit:
+// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK16-NEXT: ret void
+// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK16-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK16-NEXT: entry:
+// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK16-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK16: .execute:
+// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK16-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK16: .omp.deinit:
+// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK16-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK16: .exit:
+// CHECK16-NEXT: ret void
+// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK16-NEXT: entry:
+// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK16-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK16-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK16-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK16: cond.true:
+// CHECK16-NEXT: br label [[COND_END:%.*]]
+// CHECK16: cond.false:
+// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: br label [[COND_END]]
+// CHECK16: cond.end:
+// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK16: omp.inner.for.cond:
+// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK16-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK16: omp.inner.for.body:
+// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK16-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK16-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK16-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK16-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK16-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK16-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK16-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK16-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK16-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK16-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK16-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK16-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK16-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK16: omp.inner.for.inc:
+// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK16-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK16-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK16-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK16-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK16: cond.true6:
+// CHECK16-NEXT: br label [[COND_END8:%.*]]
+// CHECK16: cond.false7:
+// CHECK16-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: br label [[COND_END8]]
+// CHECK16: cond.end8:
+// CHECK16-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK16-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK16: omp.inner.for.end:
+// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK16: omp.loop.exit:
+// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK16-NEXT: ret void
+// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK16-NEXT: entry:
+// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK16-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK16-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK16-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK16-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK16: omp.inner.for.cond:
+// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK16: omp.inner.for.body:
+// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK16-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK16-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK16-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK16-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK16-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK16-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK16-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK16-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK16-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK16-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK16: omp.body.continue:
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK16: omp.inner.for.inc:
+// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK16-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK16: omp.inner.for.end:
+// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK16: omp.loop.exit:
+// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK16-NEXT: ret void
+// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
+// CHECK16-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK16-NEXT: entry:
+// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK16: .execute:
+// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK16-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK16: .omp.deinit:
+// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK16-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK16: .exit:
+// CHECK16-NEXT: ret void
+// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK16-NEXT: entry:
+// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK16-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK16-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK16-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK16-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK16-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK16-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK16-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK16-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK16: land.lhs.true:
+// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK16-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK16-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK16: omp.precond.then:
+// CHECK16-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK16-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK16-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK16-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK16-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK16-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK16-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK16-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK16-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK16-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK16-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK16: cond.true:
+// CHECK16-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK16-NEXT: br label [[COND_END:%.*]]
+// CHECK16: cond.false:
+// CHECK16-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK16-NEXT: br label [[COND_END]]
+// CHECK16: cond.end:
+// CHECK16-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK16-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK16-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK16-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK16: omp.inner.for.cond:
+// CHECK16-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK16-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK16-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK16-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK16: omp.inner.for.body:
+// CHECK16-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK16-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+// CHECK16-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK16-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
+// CHECK16-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK16-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK16-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK16-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8*
+// CHECK16-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK16-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK16-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK16-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK16-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK16-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK16-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK16-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK16-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK16-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK16-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK16-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4)
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK16: omp.inner.for.inc:
+// CHECK16-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK16-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK16-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK16-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK16-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK16-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK16-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK16-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK16-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]]
+// CHECK16-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK16-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK16-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK16-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]]
+// CHECK16-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK16: cond.true18:
+// CHECK16-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK16-NEXT: br label [[COND_END20:%.*]]
+// CHECK16: cond.false19:
+// CHECK16-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK16-NEXT: br label [[COND_END20]]
+// CHECK16: cond.end20:
+// CHECK16-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ]
+// CHECK16-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK16-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK16-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK16: omp.inner.for.end:
+// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK16: omp.loop.exit:
+// CHECK16-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK16-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK16: omp.precond.end:
+// CHECK16-NEXT: ret void
+// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK16-NEXT: entry:
+// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK16-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK16-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK16-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK16-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK16-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK16-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK16-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK16-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK16: land.lhs.true:
+// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK16-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK16-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK16: omp.precond.then:
+// CHECK16-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK16-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK16-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
+// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK16-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK16-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8
+// CHECK16-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8
+// CHECK16-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK16-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK16-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK16-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK16-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK16: omp.inner.for.cond:
+// CHECK16-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK16-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]]
+// CHECK16-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK16: omp.inner.for.body:
+// CHECK16-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK16-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK16-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK16-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
+// CHECK16-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
+// CHECK16-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]]
+// CHECK16-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
+// CHECK16-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
+// CHECK16-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK16-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4
+// CHECK16-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK16-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK16-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK16-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK16-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
+// CHECK16-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]]
+// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK16-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK16-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
+// CHECK16-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
+// CHECK16-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
+// CHECK16-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
+// CHECK16-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]]
+// CHECK16-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
+// CHECK16-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
+// CHECK16-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
+// CHECK16-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4
+// CHECK16-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK16-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK16-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK16-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]]
+// CHECK16-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK16-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]]
+// CHECK16-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4
+// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK16: omp.body.continue:
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK16: omp.inner.for.inc:
+// CHECK16-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK16-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK16-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK16: omp.inner.for.end:
+// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK16: omp.loop.exit:
+// CHECK16-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK16-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK16: omp.precond.end:
+// CHECK16-NEXT: ret void
+// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
+// CHECK16-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK16-NEXT: entry:
+// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK16-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK16-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK16: .execute:
+// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK16-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK16-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
+// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK16: .omp.deinit:
+// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK16-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK16: .exit:
+// CHECK16-NEXT: ret void
+// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK16-NEXT: entry:
+// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK16-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK16-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK16-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK16-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK16: omp.precond.then:
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK16-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK16-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK16-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK16-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK16: cond.true:
+// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK16-NEXT: br label [[COND_END:%.*]]
+// CHECK16: cond.false:
+// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: br label [[COND_END]]
+// CHECK16: cond.end:
+// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK16: omp.inner.for.cond:
+// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK16-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK16-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK16: omp.inner.for.body:
+// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK16-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK16-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK16-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK16-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4
+// CHECK16-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK16-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK16-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4
+// CHECK16-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK16-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK16-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK16-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK16-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK16-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK16-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK16-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8*
+// CHECK16-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK16-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK16-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5)
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK16: omp.inner.for.inc:
+// CHECK16-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
+// CHECK16-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK16-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK16-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK16-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+// CHECK16-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK16: cond.true10:
+// CHECK16-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK16-NEXT: br label [[COND_END12:%.*]]
+// CHECK16: cond.false11:
+// CHECK16-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: br label [[COND_END12]]
+// CHECK16: cond.end12:
+// CHECK16-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ]
+// CHECK16-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK16-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK16-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK16: omp.inner.for.end:
+// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK16: omp.loop.exit:
+// CHECK16-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
+// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
+// CHECK16-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK16: omp.precond.end:
+// CHECK16-NEXT: ret void
+// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK16-NEXT: entry:
+// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK16-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK16-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK16-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK16-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK16: omp.precond.then:
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK16-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK16-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK16-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK16-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK16: omp.inner.for.cond:
+// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK16-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK16-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK16: omp.inner.for.body:
+// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK16-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK16-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]]
+// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK16-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]]
+// CHECK16-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4
+// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK16: omp.body.continue:
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK16: omp.inner.for.inc:
+// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK16-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK16-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK16: omp.inner.for.end:
+// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK16: omp.loop.exit:
+// CHECK16-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]])
+// CHECK16-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK16: omp.precond.end:
+// CHECK16-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28
+// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
//
//
-// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__5
-// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK14-NEXT: entry:
-// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
-// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK14-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
-// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
-// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK14-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
-// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
-// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
-// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK14-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
-// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK14: omp.inner.for.cond:
-// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK14-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
-// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK14: omp.inner.for.body:
-// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
-// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK14-NEXT: store i32 [[ADD]], i32* [[I]], align 4
-// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
-// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
-// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-// CHECK14-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
-// CHECK14-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
-// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK14: omp.body.continue:
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK14: omp.inner.for.inc:
-// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK14-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
-// CHECK14-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK14: omp.inner.for.end:
-// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK14: omp.loop.exit:
-// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
-// CHECK14-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
+// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:
[[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// 
CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV8]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV9]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK1-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK1-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i64 5) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: 
[[ADD12:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK1-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK1: cond.true14: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END16:%.*]] +// CHECK1: cond.false15: +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END16]] +// CHECK1: cond.end16: +// CHECK1-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE14]] ], [ [[TMP44]], [[COND_FALSE15]] ] +// CHECK1-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK1-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: store i32 [[TMP50]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[L2]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** 
[[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK1: omp.dispatch.cond: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK1-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK1-NEXT: store i32 
[[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK1-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK1: omp.dispatch.body: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK1-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK1: omp.dispatch.inc: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK1-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK1: omp.dispatch.end: +// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK1-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] 
+// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK14-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK14: .execute: -// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK14-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK14: .omp.deinit: -// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK14-NEXT: br label [[DOTEXIT:%.*]] -// CHECK14: .exit: -// CHECK14-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34 +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], 
i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK14-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK14-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK14: cond.true: -// CHECK14-NEXT: br label [[COND_END:%.*]] -// CHECK14: cond.false: -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END]] -// CHECK14: cond.end: -// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP6:%.*]] = 
load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK14-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK14-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK14-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK14-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK14-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK14-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK14-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK14-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK14-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK14-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK14-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK14-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK14-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK14-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK14-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK14: cond.true6: -// CHECK14-NEXT: br label [[COND_END8:%.*]] -// CHECK14: cond.false7: -// CHECK14-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: br label [[COND_END8]] -// CHECK14: cond.end8: -// CHECK14-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK14-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// 
CHECK14-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK14-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* 
[[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP31:%.*]] = load 
i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK1: cond.true11: +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END13:%.*]] +// CHECK1: cond.false12: +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END13]] +// CHECK1: cond.end13: +// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK14-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4 -// 
CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK14-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK14-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK14-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK14-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK14-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK14-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// 
CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK14-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK14-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK14: omp.body.continue: -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK14-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK14-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 
[[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
+// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
+// CHECK1-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
//
//
-// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67
-// CHECK14-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
-// CHECK14-NEXT: entry:
-// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
-// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
-// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
-// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK14: .execute:
-// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
-// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
-// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK14-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
-// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK14: .omp.deinit:
-// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK14-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK14: .exit:
-// CHECK14-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39
+// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
//
//
-// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__8
-// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
-// CHECK14-NEXT: entry:
-// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
-// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
-// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
-// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
-// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
-// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
-// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[I9:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[J10:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
-// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
-// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
-// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
-// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK14-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
-// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
-// CHECK14-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
-// CHECK14-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
-// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
-// CHECK14-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
-// CHECK14-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK14-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK14-NEXT: store i32 0, i32* [[J]], align 4
-// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
-// CHECK14-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK14: land.lhs.true:
-// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
-// CHECK14-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
-// CHECK14: omp.precond.then:
-// CHECK14-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK14-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK14-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK14-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK14-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
-// CHECK14-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
-// CHECK14-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
-// CHECK14-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK14-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK14-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
-// CHECK14-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK14: cond.true:
-// CHECK14-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK14-NEXT: br label [[COND_END:%.*]]
-// CHECK14: cond.false:
-// CHECK14-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK14-NEXT: br label [[COND_END]]
-// CHECK14: cond.end:
-// CHECK14-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
-// CHECK14-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK14-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK14-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK14: omp.inner.for.cond:
-// CHECK14-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK14-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
-// CHECK14-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
-// CHECK14-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK14: omp.inner.for.body:
-// CHECK14-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK14-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
-// CHECK14-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK14-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
-// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
-// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
-// CHECK14-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
-// CHECK14-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8*
-// CHECK14-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
-// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
-// CHECK14-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8*
-// CHECK14-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
-// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
-// CHECK14-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8*
-// CHECK14-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
-// CHECK14-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
-// CHECK14-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
-// CHECK14-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
-// CHECK14-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
-// CHECK14-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4)
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK14: omp.inner.for.inc:
-// CHECK14-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK14-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
-// CHECK14-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK14-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK14-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
-// CHECK14-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK14-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK14-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK14-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]]
-// CHECK14-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK14-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK14-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK14-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]]
-// CHECK14-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
-// CHECK14: cond.true18:
-// CHECK14-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK14-NEXT: br label [[COND_END20:%.*]]
-// CHECK14: cond.false19:
-// CHECK14-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK14-NEXT: br label [[COND_END20]]
-// CHECK14: cond.end20:
-// CHECK14-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ]
-// CHECK14-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8
-// CHECK14-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
-// CHECK14-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK14: omp.inner.for.end:
-// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK14: omp.loop.exit:
-// CHECK14-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
-// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
-// CHECK14-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK14: omp.precond.end:
-// CHECK14-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK1: cond.true5:
+// CHECK1-NEXT: br label [[COND_END7:%.*]]
+// CHECK1: cond.false6:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END7]]
+// CHECK1: cond.end7:
+// CHECK1-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ]
+// CHECK1-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK1-NEXT: ret void
//
//
-// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__9
-// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
-// CHECK14-NEXT: entry:
-// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
-// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
-// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
-// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
-// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
-// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
-// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[I11:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[J12:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
-// CHECK14-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
-// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
-// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK14-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
-// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
-// CHECK14-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
-// CHECK14-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
-// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
-// CHECK14-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
-// CHECK14-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK14-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK14-NEXT: store i32 0, i32* [[J]], align 4
-// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
-// CHECK14-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK14: land.lhs.true:
-// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
-// CHECK14-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
-// CHECK14: omp.precond.then:
-// CHECK14-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
-// CHECK14-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK14-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
-// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK14-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64
-// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK14-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64
-// CHECK14-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8
-// CHECK14-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8
-// CHECK14-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK14-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
-// CHECK14-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
-// CHECK14-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
-// CHECK14-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK14: omp.inner.for.cond:
-// CHECK14-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK14-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64
-// CHECK14-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]]
-// CHECK14-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK14: omp.inner.for.body:
-// CHECK14-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0
-// CHECK14-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
-// CHECK14-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
-// CHECK14-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
-// CHECK14-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]]
-// CHECK14-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
-// CHECK14-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
-// CHECK14-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
-// CHECK14-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4
-// CHECK14-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0
-// CHECK14-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
-// CHECK14-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
-// CHECK14-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
-// CHECK14-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]]
-// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK14-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0
-// CHECK14-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
-// CHECK14-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
-// CHECK14-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
-// CHECK14-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
-// CHECK14-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]]
-// CHECK14-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
-// CHECK14-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
-// CHECK14-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
-// CHECK14-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4
-// CHECK14-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4
-// CHECK14-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4
-// CHECK14-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
-// CHECK14-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4
-// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]]
-// CHECK14-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4
-// CHECK14-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]]
-// CHECK14-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4
-// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK14: omp.body.continue:
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK14: omp.inner.for.inc:
-// CHECK14-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK14-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
-// CHECK14-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK14: omp.inner.for.end:
-// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK14: omp.loop.exit:
-// CHECK14-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
-// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
-// CHECK14-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK14: omp.precond.end:
-// CHECK14-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK1-NEXT: ret void
//
//
-// CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
-// CHECK14-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
-// CHECK14-NEXT: entry:
-// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
-// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
-// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
-// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
-// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
-// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK14-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK14-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK14: .execute:
-// CHECK14-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
-// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
-// CHECK14-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK14-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
-// CHECK14-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK14: .omp.deinit:
-// CHECK14-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK14-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK14: .exit:
-// CHECK14-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK1-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
+// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK1: cond.true7:
+// CHECK1-NEXT: br label [[COND_END9:%.*]]
+// CHECK1: cond.false8:
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END9]]
+// CHECK1: cond.end9:
+// CHECK1-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
+// CHECK1-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
+// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
+// CHECK1-NEXT: store i32 [[ADD8]], i32* [[J]], align 4
+// CHECK1-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]]
+// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK1-NEXT: ret void
//
//
-// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__10
-// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
-// CHECK14-NEXT: entry:
-// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
-// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
-// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
-// CHECK14-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
-// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
-// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
-// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
-// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK14-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK14: omp.precond.then:
-// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK14-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK14-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK14-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK14-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
-// CHECK14-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK14: cond.true:
-// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK14-NEXT: br label [[COND_END:%.*]]
-// CHECK14: cond.false:
-// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: br label [[COND_END]]
-// CHECK14: cond.end:
-// CHECK14-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
-// CHECK14-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK14: omp.inner.for.cond:
-// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK14-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
-// CHECK14-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK14: omp.inner.for.body:
-// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK14-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
-// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
-// CHECK14-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4
-// CHECK14-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
-// CHECK14-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8*
-// CHECK14-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4
-// CHECK14-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
-// CHECK14-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8*
-// CHECK14-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4
-// CHECK14-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
-// CHECK14-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8*
-// CHECK14-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
-// CHECK14-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
-// CHECK14-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
-// CHECK14-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
-// CHECK14-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
-// CHECK14-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK14-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
-// CHECK14-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
-// CHECK14-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK14-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5)
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK14: omp.inner.for.inc:
-// CHECK14-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
-// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK14-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
-// CHECK14-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK14-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
-// CHECK14-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK14-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
-// CHECK14-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
-// CHECK14: cond.true10:
-// CHECK14-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK14-NEXT: br label [[COND_END12:%.*]]
-// CHECK14: cond.false11:
-// CHECK14-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: br label [[COND_END12]]
-// CHECK14: cond.end12:
-// CHECK14-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ]
-// CHECK14-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK14-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK14-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4
-// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK14: omp.inner.for.end:
-// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK14: omp.loop.exit:
-// CHECK14-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK14-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
-// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
-// CHECK14-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK14: omp.precond.end:
-// CHECK14-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l52
+// CHECK1-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I10:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J11:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
+// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
+// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1
+// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]]
+// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK1-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK1-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: land.lhs.true:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV12]])
+// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK1-NEXT: [[CMP14:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK1-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[CONV15:%.*]] = bitcast i64* [[N_CASTED]] to i32*
+// CHECK1-NEXT: store i32 [[TMP19]], i32* [[CONV15]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i64, i64* [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to i8*
+// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8*
+// CHECK1-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8*
+// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP31]], i64 4)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP32:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP33:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP32]], [[TMP33]]
+// CHECK1-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK1-NEXT: store i64 [[ADD17]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: [[ADD18:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK1-NEXT: store i64 [[ADD18]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]]
+// CHECK1-NEXT: br i1 [[CMP19]], label [[COND_TRUE20:%.*]], label [[COND_FALSE21:%.*]]
+// CHECK1: cond.true20:
+// CHECK1-NEXT: [[TMP40:%.*]] = load i64, i64*
[[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: br label [[COND_END22:%.*]] +// CHECK1: cond.false21: +// CHECK1-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: br label [[COND_END22]] +// CHECK1: cond.end22: +// CHECK1-NEXT: [[COND23:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE20]] ], [ [[TMP41]], [[COND_FALSE21]] ] +// CHECK1-NEXT: store i64 [[COND23]], i64* [[DOTOMP_COMB_UB]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 +// CHECK1-NEXT: store i64 [[TMP42]], i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK14-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK14-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK14-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK14-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK14-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK14-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK14-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK14-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK14-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// 
CHECK14-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK14-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK14-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK14-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK14: omp.precond.then: -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK14-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK14-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK14-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK14-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK14-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK14-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK14: omp.inner.for.cond: -// CHECK14-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK14-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK14-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK14: omp.inner.for.body: -// CHECK14-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK14-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK14-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK14-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK14-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK14-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] -// CHECK14-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK14-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK14-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] -// CHECK14-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 -// CHECK14-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK14: omp.body.continue: -// CHECK14-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK14: omp.inner.for.inc: -// CHECK14-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK14-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK14-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK14-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK14: omp.inner.for.end: -// CHECK14-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK14: omp.loop.exit: -// CHECK14-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK14-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK14-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK14-NEXT: br label 
[[OMP_PRECOND_END]] -// CHECK14: omp.precond.end: -// CHECK14-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I10:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J11:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[DIV]] to i64 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK1-NEXT: [[DIV6:%.*]] = sdiv i32 [[SUB5]], 1 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[DIV6]] to i64 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV7]] +// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i64 [[MUL]], 1 +// CHECK1-NEXT: store i64 [[SUB8]], i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: land.lhs.true: +// 
CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[CMP9:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK1-NEXT: br i1 [[CMP9]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 +// CHECK1-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[TMP8]], i64* [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: store i64 [[TMP9]], i64* [[DOTOMP_UB]], align 8 +// CHECK1-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) +// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 +// CHECK1-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP12:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]] +// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1 +// CHECK1-NEXT: [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]] +// CHECK1-NEXT: [[CONV16:%.*]] = sext i32 [[MUL15]] to i64 +// CHECK1-NEXT: [[DIV17:%.*]] = sdiv i64 [[TMP15]], [[CONV16]] +// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i64 [[DIV17]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL18]] +// CHECK1-NEXT: [[CONV19:%.*]] = trunc i64 [[ADD]] to i32 +// CHECK1-NEXT: store i32 [[CONV19]], i32* [[I10]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[SUB20:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK1-NEXT: [[DIV21:%.*]] = sdiv i32 [[SUB20]], 1 +// CHECK1-NEXT: [[MUL22:%.*]] = mul nsw i32 1, [[DIV21]] +// CHECK1-NEXT: [[CONV23:%.*]] = sext i32 [[MUL22]] to i64 +// CHECK1-NEXT: [[DIV24:%.*]] = sdiv i64 [[TMP18]], [[CONV23]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[SUB25:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK1-NEXT: [[DIV26:%.*]] = sdiv i32 [[SUB25]], 1 +// CHECK1-NEXT: [[MUL27:%.*]] = mul nsw i32 1, [[DIV26]] +// CHECK1-NEXT: [[CONV28:%.*]] = sext i32 [[MUL27]] to i64 +// CHECK1-NEXT: [[MUL29:%.*]] = mul nsw i64 [[DIV24]], [[CONV28]] +// CHECK1-NEXT: [[SUB30:%.*]] = sub nsw i64 [[TMP17]], [[MUL29]] +// CHECK1-NEXT: [[MUL31:%.*]] = mul nsw i64 [[SUB30]], 1 +// CHECK1-NEXT: [[ADD32:%.*]] = add nsw i64 0, 
[[MUL31]] +// CHECK1-NEXT: [[CONV33:%.*]] = trunc i64 [[ADD32]] to i32 +// CHECK1-NEXT: store i32 [[CONV33]], i32* [[J11]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK1-NEXT: [[ADD34:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK1-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP24]] to i64 +// CHECK1-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM35]] +// CHECK1-NEXT: store i32 [[ADD34]], i32* [[ARRAYIDX36]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 +// CHECK1-NEXT: [[ADD37:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] +// CHECK1-NEXT: store i64 [[ADD37]], i64* [[DOTOMP_IV]], align 8 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], 
align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca 
i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: 
[[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK1-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* +// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK1-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK1-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// 
CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK1: cond.true11: +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END13:%.*]] +// CHECK1: cond.false12: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END13]] +// CHECK1: cond.end13: +// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] +// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** 
[[A_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK1-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] +// CHECK1-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// 
CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK15-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK15-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// 
CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK15-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK15-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK15-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK15-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK15-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK15-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK15-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] 
to i8* -// CHECK15-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK15-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK15-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK15-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK15-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK15-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK15-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -// CHECK15-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK15-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -// CHECK15-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK15: cond.true11: -// CHECK15-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: br label [[COND_END13:%.*]] -// CHECK15: cond.false12: -// CHECK15-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END13]] -// CHECK15: cond.end13: -// CHECK15-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] -// CHECK15-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) -// CHECK15-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -// CHECK15-NEXT: br i1 [[TMP49]], label 
[[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK15: .omp.lastprivate.then: -// CHECK15-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK15: .omp.lastprivate.done: -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28 +// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca 
i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK2-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK2-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// 
CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV8]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV9]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK2-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK2-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i64 5) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: 
[[ADD10:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK2-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK2-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK2: cond.true14: +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: br label [[COND_END16:%.*]] +// CHECK2: cond.false15: +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END16]] +// CHECK2: cond.end16: +// CHECK2-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE14]] ], [ [[TMP44]], [[COND_FALSE15]] ] +// CHECK2-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK2-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK2-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP50:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: store i32 [[TMP50]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[L2]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: 
[[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK2: omp.dispatch.cond: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] 
+// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK2-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK2-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK2: omp.dispatch.body: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK2: omp.dispatch.inc: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK2: omp.dispatch.end: +// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 
[[TMP27]]) +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK2-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK2-NEXT: store i32 [[TMP30]], i32* [[CONV1]], align 8 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store 
i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK15-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK15: omp.dispatch.cond: -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK15: omp.dispatch.body: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK15-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK15-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK15-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK15-NEXT: store i32 
[[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK15: omp.dispatch.inc: -// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK15-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK15: omp.dispatch.end: -// CHECK15-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK15-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK15: .omp.lastprivate.then: -// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 -// CHECK15-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK15: .omp.lastprivate.done: -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34 +// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// 
CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = 
load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 
8 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK2: cond.true11: +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END13:%.*]] +// CHECK2: cond.false12: +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END13]] +// CHECK2: cond.end13: +// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], 
align 8 +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK15-NEXT: [[TMP7:%.*]] = load 
i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK15-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK15-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK15-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK15-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK15-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK15-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK15-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK15-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK15-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK15-NEXT: store i32 [[ADD6]], i32* 
[[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK15-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK15: cond.true10: -// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: br label [[COND_END12:%.*]] -// CHECK15: cond.false11: -// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END12]] -// CHECK15: cond.end12: -// CHECK15-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK15-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 
[[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK2-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK2-NEXT: 
[[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK2-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK2-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* 
[[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK15-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK15-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK15-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK15-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK15-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK15-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39 +// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// 
CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK2: cond.true5: +// CHECK2-NEXT: br label [[COND_END7:%.*]] +// CHECK2: cond.false6: +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END7]] +// CHECK2: cond.end7: 
+// CHECK2-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK2-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK2-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK15-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK2-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 
0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK15-NEXT: br i1 [[CMP]], 
label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK15-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK15-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK15-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK15-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK15-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK15: cond.true5: -// CHECK15-NEXT: br label [[COND_END7:%.*]] -// CHECK15: cond.false6: -// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END7]] -// CHECK15: cond.end7: -// 
CHECK15-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK15-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK2-NEXT: store 
i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK2-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK2-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK2: cond.true7: +// CHECK2-NEXT: 
br label [[COND_END9:%.*]] +// CHECK2: cond.false8: +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END9]] +// CHECK2: cond.end9: +// CHECK2-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK2-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: 
[[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK15-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK15-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK15-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK2-NEXT: store i32 0, i32* 
[[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK2-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK2-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK2-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK2-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK2-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label 
[[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 -// CHECK15-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l52 +// CHECK2-SAME: (i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], 
i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__8 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I8:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J9:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] +// CHECK2-NEXT: [[SUB6:%.*]] 
= sub nsw i32 [[MUL]], 1 +// CHECK2-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: land.lhs.true: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]] +// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP14]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK2-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV12:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP21]], i32* [[CONV12]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* 
[[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to i8* +// CHECK2-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to i8* +// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to i8* +// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i64 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK2-NEXT: br i1 [[CMP16]], label [[COND_TRUE17:%.*]], label [[COND_FALSE18:%.*]] +// CHECK2: cond.true17: +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: br label [[COND_END19:%.*]] +// CHECK2: cond.false18: +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END19]] +// CHECK2: cond.end19: +// CHECK2-NEXT: [[COND20:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE17]] ], [ [[TMP43]], [[COND_FALSE18]] ] +// CHECK2-NEXT: store i32 [[COND20]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], 
align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[CMP2:%.*]] = icmp slt i32 
[[TMP6]], 100 -// CHECK15-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK15-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK15-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK15-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK15-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK15-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK15-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK15-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK15-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK15: cond.true6: -// CHECK15-NEXT: br label [[COND_END8:%.*]] -// CHECK15: cond.false7: -// CHECK15-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END8]] -// CHECK15: cond.end8: -// CHECK15-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK15-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br 
label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I10:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J11:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]] +// CHECK2-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1 +// CHECK2-NEXT: store i32 [[SUB6]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[J]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], 
label [[OMP_PRECOND_END:%.*]] +// CHECK2: land.lhs.true: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP8]] to i32 +// CHECK2-NEXT: [[TMP9:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV9:%.*]] = trunc i64 [[TMP9]] to i32 +// CHECK2-NEXT: store i32 [[CONV8]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV9]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV12:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK2-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP13:%.*]] = icmp ule i64 [[CONV12]], [[TMP14]] +// CHECK2-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP16]], 0 +// CHECK2-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK2-NEXT: [[MUL16:%.*]] = mul nsw i32 1, [[DIV15]] +// CHECK2-NEXT: [[DIV17:%.*]] = sdiv i32 [[TMP15]], [[MUL16]] +// CHECK2-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL18]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I10]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0 +// CHECK2-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1 +// CHECK2-NEXT: [[MUL21:%.*]] = mul nsw i32 1, [[DIV20]] +// CHECK2-NEXT: [[DIV22:%.*]] = sdiv i32 [[TMP18]], [[MUL21]] +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[SUB23:%.*]] = sub nsw i32 [[TMP20]], 0 +// CHECK2-NEXT: [[DIV24:%.*]] = sdiv i32 [[SUB23]], 1 +// CHECK2-NEXT: [[MUL25:%.*]] = mul nsw i32 1, [[DIV24]] +// CHECK2-NEXT: [[MUL26:%.*]] = mul nsw i32 [[DIV22]], [[MUL25]] +// CHECK2-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP17]], [[MUL26]] +// CHECK2-NEXT: [[MUL28:%.*]] = mul nsw i32 [[SUB27]], 1 +// CHECK2-NEXT: [[ADD29:%.*]] = add nsw i32 0, [[MUL28]] 
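+// Note: the arithmetic above recovers the collapsed loop indices from the
+// linear induction variable: i = iv / n and j = iv - (iv / n) * n (i.e.
+// iv % n), where n is the captured inner trip count [[DOTCAPTURE_EXPR_2]].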
+// CHECK2-NEXT: store i32 [[ADD29]], i32* [[J11]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK2-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[I10]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[J11]], align 4 +// CHECK2-NEXT: [[IDXPROM31:%.*]] = sext i32 [[TMP24]] to i64 +// CHECK2-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM31]] +// CHECK2-NEXT: store i32 [[ADD30]], i32* [[ARRAYIDX32]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP25]], [[TMP26]] +// CHECK2-NEXT: store i32 [[ADD33]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[F]], i32* [[F_ADDR]], 
align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK15-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK15-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK15-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK15-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK15-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK15-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK15-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK15-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 
-// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK15-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK15-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* 
[[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK2-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 8 +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 8 +// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK2-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 8 +// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr 
inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK2-NEXT: [[TMP28:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 +// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK2-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP20]] to i8* +// CHECK2-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP33]], i64 5) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] +// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP40]], [[TMP41]] +// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK2: cond.true11: +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END13:%.*]] +// CHECK2: cond.false12: +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END13]] +// CHECK2: cond.end13: +// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP42]], [[COND_TRUE11]] ], [ [[TMP43]], [[COND_FALSE12]] ] +// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP44]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// 
CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I9:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J10:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK15-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK15-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK15-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK15-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK15-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: land.lhs.true: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK15-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: store i32 0, i32* 
[[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) -// CHECK15-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK15-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK15-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK15-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 -// CHECK15-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* -// CHECK15-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK15-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK15-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK15-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK15-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK15-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK15-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK15-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 
4 -// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// CHECK15-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK15-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK15-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] -// CHECK15-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] -// CHECK15-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] -// CHECK15: cond.true18: -// CHECK15-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: br label [[COND_END20:%.*]] -// CHECK15: cond.false19: -// CHECK15-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: br label [[COND_END20]] -// CHECK15: cond.end20: -// CHECK15-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] -// CHECK15-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK15-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK15-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__11 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// 
CHECK2-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 8 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK2-NEXT: [[TMP11:%.*]] = load 
i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 8 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i64 [[IDXPROM]] +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK2-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM8]] +// CHECK2-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX9]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: 
[[I11:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[J12:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK15-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK15-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK15-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK15-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK15-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: land.lhs.true: -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK15-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8 -// CHECK15-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK15-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK15-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8 -// CHECK15-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8 -// CHECK15-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1) -// CHECK15-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8 -// CHECK15-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: br label 
[[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK15-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]] -// CHECK15-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0 -// CHECK15-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK15-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]] -// CHECK15-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64 -// CHECK15-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]] -// CHECK15-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]] -// CHECK15-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32 -// CHECK15-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0 -// CHECK15-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1 -// CHECK15-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]] -// CHECK15-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64 -// CHECK15-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]] -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK15-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0 -// CHECK15-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1 -// CHECK15-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]] -// CHECK15-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64 -// CHECK15-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]] -// CHECK15-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]] -// CHECK15-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1 -// CHECK15-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]] -// CHECK15-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32 -// CHECK15-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4 -// CHECK15-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK15-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK15-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]] -// CHECK15-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]] -// CHECK15-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4 -// CHECK15-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]] -// CHECK15-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK15-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]] -// CHECK15-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: 
omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: 
[[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = 
load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK3-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// 
CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK3: cond.true11: +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END13:%.*]] +// CHECK3: cond.false12: +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END13]] +// CHECK3: cond.end13: +// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0 +// CHECK3-NEXT: br i1 [[TMP47]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP48:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP48]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[L1]]) +// CHECK3-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74 -// CHECK15-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK15-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK15: .execute: -// CHECK15-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load 
i32*, i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK15-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] -// CHECK15-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK15: .omp.deinit: -// CHECK15-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK15-NEXT: br label [[DOTEXIT:%.*]] -// CHECK15: .exit: -// CHECK15-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load 
i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK3: omp.dispatch.cond: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK3: omp.dispatch.body: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK3-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK3: omp.dispatch.inc: +// CHECK3-NEXT: 
[[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK3: omp.dispatch.end: +// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK3-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = 
sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK15-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK15-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK15-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK15: cond.true: -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: br label [[COND_END:%.*]] -// CHECK15: cond.false: -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END]] -// CHECK15: cond.end: -// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK15-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK15-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK15-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK15-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4 -// CHECK15-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK15-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] 
to i8* -// CHECK15-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4 -// CHECK15-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK15-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK15-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK15-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK15-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK15-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK15-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK15-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8* -// CHECK15-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK15-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -// CHECK15-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK15-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5) -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] -// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]] -// CHECK15-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]] -// CHECK15-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]] -// CHECK15-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK15: cond.true10: -// CHECK15-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: br label [[COND_END12:%.*]] -// CHECK15: cond.false11: -// CHECK15-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: br label [[COND_END12]] -// CHECK15: cond.end12: -// CHECK15-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ] -// CHECK15-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK15-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4 -// CHECK15-NEXT: call void 
@__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK15-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK15-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 -// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK15-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK15-NEXT: store i32 
[[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK15-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK15-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK15-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK15-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK15-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK15-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK15: omp.precond.then: -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK15-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK15-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK15-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK15-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK15-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK15-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK15: omp.inner.for.cond: -// CHECK15-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK15-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK15-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK15: omp.inner.for.body: -// CHECK15-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK15-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK15-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 -// CHECK15-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] -// CHECK15-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] -// CHECK15-NEXT: store i32 [[TMP15]], i32* 
[[ARRAYIDX5]], align 4 -// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK15: omp.body.continue: -// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK15: omp.inner.for.inc: -// CHECK15-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK15-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK15-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK15: omp.inner.for.end: -// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK15: omp.loop.exit: -// CHECK15-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK15-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK15-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]]) -// CHECK15-NEXT: br label [[OMP_PRECOND_END]] -// CHECK15: omp.precond.end: -// CHECK15-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* 
[[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK3-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [4 x 
i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK3-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK3: cond.true10: +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END12:%.*]] +// CHECK3: cond.false11: +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END12]] +// CHECK3: cond.end12: +// CHECK3-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK3-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43 -// CHECK16-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK16-NEXT: entry: -// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK16-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK16-NEXT: 
store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK16: .execute: -// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK16-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK16: .omp.deinit: -// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK16-NEXT: br label [[DOTEXIT:%.*]] -// CHECK16: .exit: -// CHECK16-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: 
[[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK3-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK3-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK3-NEXT: br 
label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK16-NEXT: entry: -// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK16-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK16-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK16-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK16-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK16-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK16-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK16-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK16: omp.precond.then: -// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK16-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK16-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK16-NEXT: call void 
@__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK16-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK16-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK16: cond.true: -// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK16-NEXT: br label [[COND_END:%.*]] -// CHECK16: cond.false: -// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: br label [[COND_END]] -// CHECK16: cond.end: -// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK16-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK16: omp.inner.for.cond: -// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK16-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK16-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK16: omp.inner.for.body: -// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK16-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK16-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK16-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK16-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK16-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK16-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK16-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK16-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK16-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK16-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK16-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK16-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK16-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK16-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK16-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK16-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK16-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK16-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: [[TMP33:%.*]] = load i32, i32* 
[[TMP32]], align 4
-// CHECK16-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5)
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK16: omp.inner.for.inc:
-// CHECK16-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
-// CHECK16-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
-// CHECK16-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]]
-// CHECK16-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK16-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
-// CHECK16-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
-// CHECK16: cond.true11:
-// CHECK16-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK16-NEXT: br label [[COND_END13:%.*]]
-// CHECK16: cond.false12:
-// CHECK16-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: br label [[COND_END13]]
-// CHECK16: cond.end13:
-// CHECK16-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ]
-// CHECK16-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK16: omp.inner.for.end:
-// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK16: omp.loop.exit:
-// CHECK16-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]])
-// CHECK16-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK16-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0
-// CHECK16-NEXT: br i1 [[TMP49]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
-// CHECK16: .omp.lastprivate.then:
-// CHECK16-NEXT: [[TMP50:%.*]] = load i32, i32* [[L_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP50]], i32* [[L_ADDR]], align 4
-// CHECK16-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
-// CHECK16: .omp.lastprivate.done:
-// CHECK16-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK16: omp.precond.end:
-// CHECK16-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
-// CHECK16-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39
+// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK3: .execute:
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK3: .omp.deinit:
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK3: cond.true5:
+// CHECK3-NEXT: br label [[COND_END7:%.*]]
+// CHECK3: cond.false6:
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END7]]
+// CHECK3: cond.end7:
+// CHECK3-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK3-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK3-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK3-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__1
-// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
-// CHECK16-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I3:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
-// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK16-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK16-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK16: omp.precond.then:
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK16-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
-// CHECK16-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
-// CHECK16: omp.dispatch.cond:
-// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]]
-// CHECK16-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK16: cond.true:
-// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: br label [[COND_END:%.*]]
-// CHECK16: cond.false:
-// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: br label [[COND_END]]
-// CHECK16: cond.end:
-// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
-// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
-// CHECK16-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
-// CHECK16: omp.dispatch.body:
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK16: omp.inner.for.cond:
-// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
-// CHECK16-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK16: omp.inner.for.body:
-// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
-// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK16-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
-// CHECK16-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
-// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
-// CHECK16-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
-// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
-// CHECK16-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
-// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK16: omp.body.continue:
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK16: omp.inner.for.inc:
-// CHECK16-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
-// CHECK16-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK16: omp.inner.for.end:
-// CHECK16-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
-// CHECK16: omp.dispatch.inc:
-// CHECK16-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
-// CHECK16-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
-// CHECK16-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: br label [[OMP_DISPATCH_COND]]
-// CHECK16: omp.dispatch.end:
-// CHECK16-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
-// CHECK16-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK16-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
-// CHECK16-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
-// CHECK16: .omp.lastprivate.then:
-// CHECK16-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4
-// CHECK16-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
-// CHECK16: .omp.lastprivate.done:
-// CHECK16-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK16: omp.precond.end:
-// CHECK16-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK3-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK3: .execute:
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK3: .omp.deinit:
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK3-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK3-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK3: cond.true6:
+// CHECK3-NEXT: br label [[COND_END8:%.*]]
+// CHECK3: cond.false7:
+// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END8]]
+// CHECK3: cond.end8:
+// CHECK3-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK3-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK3-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK3-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK3-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK3-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK3-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK3-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
-// CHECK16-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
-// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
-// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK16: .execute:
-// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
-// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
-// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK16-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
-// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK16: .omp.deinit:
-// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK16-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK16: .exit:
-// CHECK16-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l52
+// CHECK3-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK3: .execute:
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK3: .omp.deinit:
+// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__2
-// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
-// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I3:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
-// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
-// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK16-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK16-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK16: omp.precond.then:
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK16-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
-// CHECK16-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK16: cond.true:
-// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: br label [[COND_END:%.*]]
-// CHECK16: cond.false:
-// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: br label [[COND_END]]
-// CHECK16: cond.end:
-// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
-// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK16: omp.inner.for.cond:
-// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK16-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
-// CHECK16-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK16: omp.inner.for.body:
-// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
-// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
-// CHECK16-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
-// CHECK16-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
-// CHECK16-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
-// CHECK16-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
-// CHECK16-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
-// CHECK16-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
-// CHECK16-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
-// CHECK16-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
-// CHECK16-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
-// CHECK16-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
-// CHECK16-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
-// CHECK16-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
-// CHECK16-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
-// CHECK16-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK16: omp.inner.for.inc:
-// CHECK16-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
-// CHECK16-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
-// CHECK16-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
-// CHECK16-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
-// CHECK16-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
-// CHECK16: cond.true10:
-// CHECK16-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: br label [[COND_END12:%.*]]
-// CHECK16: cond.false11:
-// CHECK16-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: br label [[COND_END12]]
-// CHECK16: cond.end12:
-// CHECK16-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
-// CHECK16-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK16: omp.inner.for.end:
-// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK16: omp.loop.exit:
-// CHECK16-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
-// CHECK16-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK16: omp.precond.end:
-// CHECK16-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK3-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK3-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: land.lhs.true:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK3-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK3-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK3-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK3-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+// CHECK3-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8*
+// CHECK3-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK3-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK3-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK3-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK3-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]]
+// CHECK3-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]]
+// CHECK3-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK3: cond.true18:
+// CHECK3-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: br label [[COND_END20:%.*]]
+// CHECK3: cond.false19:
+// CHECK3-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: br label [[COND_END20]]
+// CHECK3: cond.end20:
+// CHECK3-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ]
+// CHECK3-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK3-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK3-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: land.lhs.true:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK3-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK3-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK3-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8
+// CHECK3-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8
+// CHECK3-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK3-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK3-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK3-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]]
+// CHECK3-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK3-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK3-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
+// CHECK3-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
+// CHECK3-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]]
+// CHECK3-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
+// CHECK3-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK3-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK3-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK3-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK3-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
+// CHECK3-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]]
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK3-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
+// CHECK3-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
+// CHECK3-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
+// CHECK3-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
+// CHECK3-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]]
+// CHECK3-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
+// CHECK3-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
+// CHECK3-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
+// CHECK3-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK3-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]]
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK3-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]]
+// CHECK3-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK3-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__3
-// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
-// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I3:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
-// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK16-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK16-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK16: omp.precond.then:
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK16-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK16: omp.inner.for.cond:
-// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
-// CHECK16-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK16: omp.inner.for.body:
-// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
-// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK16-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
-// CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
-// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
-// CHECK16-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-// CHECK16-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
-// CHECK16-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
-// CHECK16-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
-// CHECK16-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
-// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK16:
omp.body.continue: -// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK16: omp.inner.for.inc: -// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK16-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK16: omp.inner.for.end: -// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK16: omp.loop.exit: -// CHECK16-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK16-NEXT: br label [[OMP_PRECOND_END]] -// CHECK16: omp.precond.end: -// CHECK16-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, 
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK3-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK3-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK3-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK3-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8*
+// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK3-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
+// CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+// CHECK3-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK3: cond.true10:
+// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END12:%.*]]
+// CHECK3: cond.false11:
+// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END12]]
+// CHECK3: cond.end12:
+// CHECK3-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ]
+// CHECK3-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]]
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]]
+// CHECK3-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
-// CHECK16-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
-// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK16-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
-// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK16: .execute:
-// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK16-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
-// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK16: .omp.deinit:
-// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK16-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK16: .exit:
-// CHECK16-NEXT: ret void
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28
+// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__4
-// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
-// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
-// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK16-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK16-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
-// CHECK16-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK16: cond.true:
-// CHECK16-NEXT: br label [[COND_END:%.*]]
-// CHECK16: cond.false:
-// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: br label [[COND_END]]
-// CHECK16: cond.end:
-// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
-// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK16: omp.inner.for.cond:
-// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
-// CHECK16-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK16: omp.inner.for.body:
-// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
-// CHECK16-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
-// CHECK16-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
-// CHECK16-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
-// CHECK16-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
-// CHECK16-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
-// CHECK16-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
-// CHECK16-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
-// CHECK16-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
-// CHECK16-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK16: omp.inner.for.inc:
-// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
-// CHECK16-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
-// CHECK16-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
-// CHECK16-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
-// CHECK16-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
-// CHECK16: cond.true5:
-// CHECK16-NEXT: br label [[COND_END7:%.*]]
-// CHECK16: cond.false6:
-// CHECK16-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: br label [[COND_END7]]
-// CHECK16: cond.end7:
-// CHECK16-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
-// CHECK16-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK16: omp.inner.for.end:
-// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK16: omp.loop.exit:
-// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
-// CHECK16-NEXT: ret void
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
+// CHECK4-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32*
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK4-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK4-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK4-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK4-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK4-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK4-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4
+// CHECK4-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]]
+// CHECK4-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK4: cond.true11:
+// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: br label [[COND_END13:%.*]]
+// CHECK4: cond.false12:
+// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END13]]
+// CHECK4: cond.end13:
+// CHECK4-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ]
+// CHECK4-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]])
+// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0
+// CHECK4-NEXT: br i1 [[TMP47]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK4: .omp.lastprivate.then:
+// CHECK4-NEXT: [[TMP48:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP48]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK4: .omp.lastprivate.done:
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: call void @__kmpc_free_shared(i8* [[L1]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK4-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK4-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK4: omp.dispatch.cond:
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK4: omp.dispatch.body:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK4-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK4-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK4-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK4: omp.dispatch.inc:
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK4-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK4: omp.dispatch.end:
+// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK4-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK4: .omp.lastprivate.then:
+// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP30]], i32* [[L_ADDR]], align 4
+// CHECK4-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK4: .omp.lastprivate.done:
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__5
-// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
-// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK16-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK16: omp.inner.for.cond:
-// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
-// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK16: omp.inner.for.body:
-// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
-// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK16-NEXT: store i32 [[ADD]], i32* [[I]], align 4
-// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
-// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
-// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-// CHECK16-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
-// CHECK16-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
-// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK16: omp.body.continue:
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK16: omp.inner.for.inc:
-// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
-// CHECK16-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK16: omp.inner.for.end:
-// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK16: omp.loop.exit:
-// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
-// CHECK16-NEXT: ret void
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34
+// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
-// CHECK16-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
-// CHECK16-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
-// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK16: .execute: -// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK16-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK16: .omp.deinit: -// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK16-NEXT: br label [[DOTEXIT:%.*]] -// CHECK16: .exit: -// CHECK16-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* 
[[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK4-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK4-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK4-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK4-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK4-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK4-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK4-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void 
@__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK4-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK4: cond.true10: +// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: br label [[COND_END12:%.*]] +// CHECK4: cond.false11: +// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END12]] +// CHECK4: cond.end12: +// CHECK4-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK4-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// 
CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK4-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK4-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK4-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK4-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK4-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK4-NEXT: br label [[OMP_PRECOND_END]] +// CHECK4: omp.precond.end: +// CHECK4-NEXT: ret void // // -// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK16-NEXT: entry: -// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK16-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK16-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK16-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* 
@[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK16-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK16: cond.true: -// CHECK16-NEXT: br label [[COND_END:%.*]] -// CHECK16: cond.false: -// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: br label [[COND_END]] -// CHECK16: cond.end: -// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK16: omp.inner.for.cond: -// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK16-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK16: omp.inner.for.body: -// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK16-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK16-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK16-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK16-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK16-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK16-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK16-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK16-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK16-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK16-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK16-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK16-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK16-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK16: omp.inner.for.inc: -// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK16-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK16-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK16-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK16-NEXT: store i32 
[[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK16-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK16-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK16-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK16-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK16: cond.true6: -// CHECK16-NEXT: br label [[COND_END8:%.*]] -// CHECK16: cond.false7: -// CHECK16-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: br label [[COND_END8]] -// CHECK16: cond.end8: -// CHECK16-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK16-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK16-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK16-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK16: omp.inner.for.end: -// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK16: omp.loop.exit: -// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK16-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39 +// CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK4: .execute: +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK4: .omp.deinit: +// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void // // -// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK16-NEXT: entry: -// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK16-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, 
align 4 -// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK16-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK16-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK16-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK16: omp.inner.for.cond: -// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK16-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK16: omp.inner.for.body: -// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK16-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK16-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK16-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK16-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK16-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK16-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// 
CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK16-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK16-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK16-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK16-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK16: omp.body.continue: -// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK16: omp.inner.for.inc: -// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK16-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK16-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK16: omp.inner.for.end: -// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK16: omp.loop.exit: -// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK16-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// 
CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK4: cond.true: +// CHECK4-NEXT: br label [[COND_END:%.*]] +// CHECK4: cond.false: +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END]] +// CHECK4: cond.end: +// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK4-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK4: cond.true5: +// CHECK4-NEXT: br label [[COND_END7:%.*]] +// CHECK4: cond.false6: +// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: br label [[COND_END7]] +// 
CHECK4: cond.end7: +// CHECK4-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK4-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK4-NEXT: ret void // // -// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l67 -// CHECK16-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK16-NEXT: entry: -// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK16: .execute: -// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK16-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] -// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK16: .omp.deinit: -// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK16-NEXT: br label [[DOTEXIT:%.*]] -// CHECK16: .exit: -// CHECK16-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** 
[[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK4-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK4-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK4-NEXT: ret void // // -// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__8 -// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] { -// CHECK16-NEXT: entry: -// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, 
align 4 -// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8 -// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8 -// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8 -// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8 -// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8 -// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[I9:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[J10:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0 -// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK16-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64 -// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK16-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK16-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1 -// CHECK16-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64 -// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]] -// CHECK16-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1 -// CHECK16-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK16-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK16-NEXT: store i32 0, i32* [[J]], align 4 -// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK16-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK16: land.lhs.true: -// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK16-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]] -// CHECK16-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]] -// CHECK16: omp.precond.then: -// CHECK16-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK16-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK16-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK16-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK16-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 -// CHECK16-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -// CHECK16-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]]) -// CHECK16-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK16-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK16-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]] -// CHECK16-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK16: cond.true: -// CHECK16-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK16-NEXT: br label [[COND_END:%.*]] -// CHECK16: cond.false: -// CHECK16-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK16-NEXT: br label [[COND_END]] -// CHECK16: cond.end: -// CHECK16-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ] -// CHECK16-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK16-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK16-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8 -// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK16: omp.inner.for.cond: -// CHECK16-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK16-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK16-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1 -// CHECK16-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]] -// CHECK16-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK16: omp.inner.for.body: -// CHECK16-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK16-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 -// CHECK16-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK16-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 -// CHECK16-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK16-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK16-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK16-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK16-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8* -// CHECK16-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4 -// CHECK16-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK16-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK16-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK16-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK16-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK16-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK16-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK16-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK16-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK16-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4 -// 
CHECK16-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4) -// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK16: omp.inner.for.inc: -// CHECK16-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8 -// CHECK16-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK16-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]] -// CHECK16-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8 -// CHECK16-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK16-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK16-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]] -// CHECK16-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK16-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK16-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8 -// CHECK16-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]] -// CHECK16-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK16-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK16-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK16-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]] -// CHECK16-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]] -// CHECK16: cond.true18: -// CHECK16-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8 -// CHECK16-NEXT: br label [[COND_END20:%.*]] -// CHECK16: cond.false19: -// CHECK16-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK16-NEXT: br label [[COND_END20]] -// CHECK16: cond.end20: -// CHECK16-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ] -// CHECK16-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8 -// CHECK16-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8 -// CHECK16-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8 -// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]] -// CHECK16: omp.inner.for.end: -// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK16: omp.loop.exit: -// CHECK16-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK16-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]]) -// CHECK16-NEXT: br label [[OMP_PRECOND_END]] -// CHECK16: omp.precond.end: -// CHECK16-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK4-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, 
[10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__9
-// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
-// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
-// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
-// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
-// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
-// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
-// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I11:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[J12:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
-// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
-// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK16-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
-// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK16-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
-// CHECK16-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
-// CHECK16-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
-// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
-// CHECK16-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
-// CHECK16-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK16-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[J]], align 4
-// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
-// CHECK16-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK16: land.lhs.true:
-// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK16-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
-// CHECK16-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
-// CHECK16: omp.precond.then:
-// CHECK16-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
-// CHECK16-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
-// CHECK16-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
-// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK16-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64
-// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64
-// CHECK16-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8
-// CHECK16-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8
-// CHECK16-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK16-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
-// CHECK16-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
-// CHECK16-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK16: omp.inner.for.cond:
-// CHECK16-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64
-// CHECK16-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]]
-// CHECK16-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK16: omp.inner.for.body:
-// CHECK16-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK16-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0
-// CHECK16-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
-// CHECK16-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
-// CHECK16-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
-// CHECK16-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]]
-// CHECK16-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
-// CHECK16-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
-// CHECK16-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
-// CHECK16-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4
-// CHECK16-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK16-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK16-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK16-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0
-// CHECK16-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
-// CHECK16-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
-// CHECK16-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
-// CHECK16-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]]
-// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK16-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0
-// CHECK16-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
-// CHECK16-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
-// CHECK16-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
-// CHECK16-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
-// CHECK16-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]]
-// CHECK16-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
-// CHECK16-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
-// CHECK16-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
-// CHECK16-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4
-// CHECK16-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4
-// CHECK16-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4
-// CHECK16-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
-// CHECK16-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4
-// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]]
-// CHECK16-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4
-// CHECK16-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]]
-// CHECK16-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4
-// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK16: omp.body.continue:
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK16: omp.inner.for.inc:
-// CHECK16-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
-// CHECK16-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
-// CHECK16-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
-// CHECK16-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK16: omp.inner.for.end:
-// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK16: omp.loop.exit:
-// CHECK16-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
-// CHECK16-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK16: omp.precond.end:
-// CHECK16-NEXT: ret void
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK4-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK4-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK4-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK4-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK4-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK4-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK4-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK4-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK4: cond.true6:
+// CHECK4-NEXT: br label [[COND_END8:%.*]]
+// CHECK4: cond.false7:
+// CHECK4-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END8]]
+// CHECK4: cond.end8:
+// CHECK4-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK4-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK4-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK4-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK4-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK4-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK4-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK4-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK4-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK4-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l52
+// CHECK4-SAME: (i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
-// CHECK16-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
-// CHECK16-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
-// CHECK16-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
-// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK16-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK16-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK16: .execute:
-// CHECK16-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
-// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
-// CHECK16-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK16-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
-// CHECK16-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK16: .omp.deinit:
-// CHECK16-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK16-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK16: .exit:
-// CHECK16-NEXT: ret void
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK4-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK4-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK4-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: land.lhs.true:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK4-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i64 0, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB1]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_COMB_LB]], i64* [[DOTOMP_COMB_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK4-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK4-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i64 [[COND]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP14:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: store i64 [[TMP14]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: [[TMP16:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK4-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK4-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+// CHECK4-NEXT: [[TMP19:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to i8*
+// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to i8*
+// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to i8*
+// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP30:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4
+// CHECK4-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP31]], align 4
+// CHECK4-NEXT: [[TMP33:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP32]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [10 x [10 x i32]]*)* @__omp_outlined__9 to i8*), i8* null, i8** [[TMP33]], i32 4)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP34:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: [[TMP35:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK4-NEXT: store i64 [[ADD14]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: [[TMP36:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: [[TMP37:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK4-NEXT: store i64 [[ADD15]], i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: [[TMP38:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP39:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP38]], [[TMP39]]
+// CHECK4-NEXT: store i64 [[ADD16]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP40:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP41:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP40]], [[TMP41]]
+// CHECK4-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK4: cond.true18:
+// CHECK4-NEXT: [[TMP42:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: br label [[COND_END20:%.*]]
+// CHECK4: cond.false19:
+// CHECK4-NEXT: [[TMP43:%.*]] = load i64, i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: br label [[COND_END20]]
+// CHECK4: cond.end20:
+// CHECK4-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP42]], [[COND_TRUE18]] ], [ [[TMP43]], [[COND_FALSE19]] ]
+// CHECK4-NEXT: store i64 [[COND21]], i64* [[DOTOMP_COMB_UB]], align 8
+// CHECK4-NEXT: [[TMP44:%.*]] = load i64, i64* [[DOTOMP_COMB_LB]], align 8
+// CHECK4-NEXT: store i64 [[TMP44]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP45:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP46]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__10
-// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
-// CHECK16-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I3:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
-// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
-// CHECK16-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
-// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK16-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK16-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK16: omp.precond.then:
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK16-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK16-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK16-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
-// CHECK16-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK16: cond.true:
-// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: br label [[COND_END:%.*]]
-// CHECK16: cond.false:
-// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: br label [[COND_END]]
-// CHECK16: cond.end:
-// CHECK16-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
-// CHECK16-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK16: omp.inner.for.cond:
-// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK16-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
-// CHECK16-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK16: omp.inner.for.body:
-// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
-// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
-// CHECK16-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4
-// CHECK16-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
-// CHECK16-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8*
-// CHECK16-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4
-// CHECK16-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
-// CHECK16-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8*
-// CHECK16-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4
-// CHECK16-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
-// CHECK16-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8*
-// CHECK16-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
-// CHECK16-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
-// CHECK16-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
-// CHECK16-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
-// CHECK16-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
-// CHECK16-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK16-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
-// CHECK16-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
-// CHECK16-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK16-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5)
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK16: omp.inner.for.inc:
-// CHECK16-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
-// CHECK16-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
-// CHECK16-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
-// CHECK16-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
-// CHECK16-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
-// CHECK16: cond.true10:
-// CHECK16-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: br label [[COND_END12:%.*]]
-// CHECK16: cond.false11:
-// CHECK16-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: br label [[COND_END12]]
-// CHECK16: cond.end12:
-// CHECK16-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ]
-// CHECK16-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK16-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK16: omp.inner.for.end:
-// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK16: omp.loop.exit:
-// CHECK16-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
-// CHECK16-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK16: omp.precond.end:
-// CHECK16-NEXT: ret void
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK4-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK4-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK4-NEXT: store i64 [[SUB7]], i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[J]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: land.lhs.true:
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK4-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i64 0, i64* [[DOTOMP_LB]], align 8
+// CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK4-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_UB]], align 8
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK4-NEXT: store i64 [[CONV9]], i64* [[DOTOMP_LB]], align 8
+// CHECK4-NEXT: store i64 [[CONV10]], i64* [[DOTOMP_UB]], align 8
+// CHECK4-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @[[GLOB2]], i32 [[TMP11]], i32 33, i32* [[DOTOMP_IS_LAST]], i64* [[DOTOMP_LB]], i64* [[DOTOMP_UB]], i64* [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK4-NEXT: [[TMP12:%.*]] = load i64, i64* [[DOTOMP_LB]], align 8
+// CHECK4-NEXT: store i64 [[TMP12]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP13:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK4-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]]
+// CHECK4-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP15:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK4-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK4-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
+// CHECK4-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
+// CHECK4-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]]
+// CHECK4-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
+// CHECK4-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK4-NEXT: store i32 [[CONV21]], i32* [[I11]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: [[TMP18:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK4-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK4-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK4-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
+// CHECK4-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]]
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK4-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK4-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
+// CHECK4-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
+// CHECK4-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
+// CHECK4-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
+// CHECK4-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]]
+// CHECK4-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
+// CHECK4-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
+// CHECK4-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
+// CHECK4-NEXT: store i32 [[CONV35]], i32* [[J12]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK4-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, i32* [[I11]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP23]]
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, i32* [[J12]], align 4
+// CHECK4-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP24]]
+// CHECK4-NEXT: store i32 [[ADD36]], i32* [[ARRAYIDX37]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP25:%.*]] = load i64, i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: [[TMP26:%.*]] = load i64, i64* [[DOTOMP_STRIDE]], align 8
+// CHECK4-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK4-NEXT: store i64 [[ADD38]], i64* [[DOTOMP_IV]], align 8
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
//
//
-// CHECK16-LABEL: define {{[^@]+}}@__omp_outlined__11
-// CHECK16-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
-// CHECK16-NEXT: entry:
-// CHECK16-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
-// CHECK16-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
-// CHECK16-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: [[I3:%.*]] = alloca i32, align 4
-// CHECK16-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
-// CHECK16-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
-// CHECK16-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
-// CHECK16-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK16-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK16-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK16-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK16-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK16-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK16-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK16: omp.precond.then:
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK16-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
-// CHECK16-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
-// CHECK16-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK16-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK16-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK16-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK16: omp.inner.for.cond:
-// CHECK16-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
-// CHECK16-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
-// CHECK16-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK16: omp.inner.for.body:
-// CHECK16-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
-// CHECK16-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK16-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
-// CHECK16-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4
-// CHECK16-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4
-// CHECK16-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]]
-// CHECK16-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-// CHECK16-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4
-// CHECK16-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]]
-// CHECK16-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4
-// CHECK16-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK16: omp.body.continue:
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK16: omp.inner.for.inc:
-// CHECK16-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK16-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
-// CHECK16-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
-// CHECK16-NEXT: br label [[OMP_INNER_FOR_COND]]
-// CHECK16: omp.inner.for.end:
-// CHECK16-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK16: omp.loop.exit:
-// CHECK16-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK16-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
-// CHECK16-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]])
-// CHECK16-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK16: omp.precond.end:
-// CHECK16-NEXT: ret void
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l59
+// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK4: .execute:
+// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK4-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32* [[TMP4]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK4: .omp.deinit:
+// CHECK4-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK4-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK4: .exit:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32*, i32** [[V_ADDR]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK4-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK4-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK4-NEXT: store i8* [[TMP22]], i8** [[TMP21]], align 4
+// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK4-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK4-NEXT: store i8* [[TMP24]], i8** [[TMP23]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK4-NEXT: [[TMP26:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK4-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4
+// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK4-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP18]] to i8*
+// CHECK4-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4
+// CHECK4-NEXT: [[TMP29:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
+// CHECK4-NEXT: [[TMP31:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32*)* @__omp_outlined__11 to i8*), i8* null, i8** [[TMP31]], i32 5)
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
+// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK4-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK4-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+// CHECK4-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK4: cond.true10:
+// CHECK4-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: br label [[COND_END12:%.*]]
+// CHECK4: cond.false11:
+// CHECK4-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END12]]
+// CHECK4: cond.end12:
+// CHECK4-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE10]] ], [ [[TMP41]], [[COND_FALSE11]] ]
+// CHECK4-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP42]], i32* [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP43:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP43]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32* [[V:%.*]]) #[[ATTR0]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK4-NEXT: [[V_ADDR:%.*]] = alloca i32*, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32
[[SUB]], 1 +// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK4-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK4: omp.precond.then: +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK4-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK4: omp.inner.for.cond: +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK4-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK4-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK4: omp.inner.for.body: +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK4-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[V_ADDR]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 [[TMP14]] +// CHECK4-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK4-NEXT: [[TMP16:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK4-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP16]] +// CHECK4-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX5]], align 4 +// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK4: omp.body.continue: +// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK4: omp.inner.for.inc: +// CHECK4-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK4-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK4: omp.inner.for.end: +// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK4: omp.loop.exit: +// CHECK4-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK4-NEXT: call void 
@__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP20]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
//
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -1,22 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
 // Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK4
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK5
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK6
-
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK7
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK8
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK9
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK10
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK11
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK12
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3
 // expected-no-diagnostics
 #ifndef HEADER
@@ -71,3251 +59,6 @@
 }
 #endif
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
-// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
-// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
-// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
-// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT:
call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", 
%"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 0 -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: 
[[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP23]], i32* [[CONV8]], align 4 -// CHECK1-NEXT: [[TMP24:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP25]], i32* [[CONV9]], align 4 -// CHECK1-NEXT: [[TMP26:%.*]] = load i64, i64* [[L_CASTED]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP20]] to i8* -// CHECK1-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP22]] to i8* -// CHECK1-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 8 -// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP24]] to i8* -// CHECK1-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 8 -// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP34:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 8 -// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 -// CHECK1-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP26]] to i8* -// CHECK1-NEXT: store i8* [[TMP36]], i8** [[TMP35]], align 8 -// CHECK1-NEXT: [[TMP37:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 -// CHECK1-NEXT: [[TMP39:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP38]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP39]], i64 5) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP44]], [[TMP45]] -// CHECK1-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* 
[[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP46]], [[TMP47]] -// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] -// CHECK1: cond.true14: -// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK1-NEXT: br label [[COND_END16:%.*]] -// CHECK1: cond.false15: -// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END16]] -// CHECK1: cond.end16: -// CHECK1-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP48]], [[COND_TRUE14]] ], [ [[TMP49]], [[COND_FALSE15]] ] -// CHECK1-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP50]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP51:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[TMP51]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) -// CHECK1-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0 -// CHECK1-NEXT: br i1 [[TMP54]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK1: .omp.final.then: -// CHECK1-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP55]], 0 -// CHECK1-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV19]], 1 -// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD20]], i32* [[I5]], align 4 -// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK1: .omp.final.done: -// CHECK1-NEXT: [[TMP56:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP57:%.*]] = icmp ne i32 [[TMP56]], 0 -// CHECK1-NEXT: br i1 [[TMP57]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK1: .omp.lastprivate.then: -// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: store i32 [[TMP58]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK1: .omp.lastprivate.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: [[TMP59:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP59]]) -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 -// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = 
alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK1: omp.dispatch.cond: -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] -// CHECK1-NEXT: br i1 [[CMP8]], label 
[[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] -// CHECK1-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 -// CHECK1-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK1-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK1: omp.dispatch.body: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 -// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK1: omp.body.continue: -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK1-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK1: omp.dispatch.inc: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK1-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 -// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK1: omp.dispatch.end: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK1-NEXT: 
call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK1-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK1: .omp.final.then: -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB16:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK1-NEXT: [[DIV17:%.*]] = sdiv i32 [[SUB16]], 1 -// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 -// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i32 0, [[MUL18]] -// CHECK1-NEXT: store i32 [[ADD19]], i32* [[I6]], align 4 -// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK1: .omp.final.done: -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK1-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK1: .omp.lastprivate.then: -// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[CONV1]], align 8 -// CHECK1-NEXT: store i32 [[TMP33]], i32* [[CONV1]], align 8 -// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK1: .omp.lastprivate.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK1: .execute: -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK1: .omp.deinit: -// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) 
#[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK1: omp.precond.then: -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK1: cond.true: -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: br label [[COND_END:%.*]] -// CHECK1: cond.false: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* 
[[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END]] -// CHECK1: cond.end: -// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* -// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 -// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 -// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* -// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 -// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* -// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 -// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* -// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 -// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK1-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 -// CHECK1-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 -// CHECK1-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) -// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: [[TMP35:%.*]] = load i32, 
i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] -// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK1: cond.true11: -// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK1-NEXT: br label [[COND_END13:%.*]] -// CHECK1: cond.false12: -// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: br label [[COND_END13]] -// CHECK1: cond.end13: -// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] -// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK1-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK1: omp.inner.for.end: -// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK1: omp.loop.exit: -// CHECK1-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 -// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) -// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0 -// CHECK1-NEXT: br i1 [[TMP45]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK1: .omp.final.then: -// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP46]], 0 -// CHECK1-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD17]], i32* [[I3]], align 4 -// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK1: .omp.final.done: -// CHECK1-NEXT: br label [[OMP_PRECOND_END]] -// CHECK1: omp.precond.end: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 -// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: 
[[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK1: omp.precond.then:
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
-// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32
-// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64
-// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]]
-// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4
-// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
-// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32
-// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
-// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
-// CHECK1-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2
-// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK1: omp.body.continue:
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
-// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
-// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
-// CHECK1: .omp.final.then:
-// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP21]], 0
-// CHECK1-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1
-// CHECK1-NEXT: [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1
-// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 0, [[MUL14]]
-// CHECK1-NEXT: store i32 [[ADD15]], i32* [[I5]], align 4
-// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
-// CHECK1: .omp.final.done:
-// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK1: omp.precond.end:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
-// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
-// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK1: .execute:
-// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
-// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK1: .omp.deinit:
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK1: .exit:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
-// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK1: cond.true:
-// CHECK1-NEXT: br label [[COND_END:%.*]]
-// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: br label [[COND_END]]
-// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
-// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
-// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
-// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
-// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
-// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
-// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
-// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3)
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
-// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
-// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
-// CHECK1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9
-// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
-// CHECK1: cond.true5:
-// CHECK1-NEXT: br label [[COND_END7:%.*]]
-// CHECK1: cond.false6:
-// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: br label [[COND_END7]]
-// CHECK1: cond.end7:
-// CHECK1-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ]
-// CHECK1-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
-// CHECK1-NEXT: br i1 [[TMP28]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
-// CHECK1: .omp.final.then:
-// CHECK1-NEXT: store i32 10, i32* [[I]], align 4
-// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
-// CHECK1: .omp.final.done:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
-// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
-// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
-// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
-// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
-// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
-// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
-// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK1: omp.body.continue:
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
-// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-// CHECK1-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
-// CHECK1: .omp.final.then:
-// CHECK1-NEXT: store i32 10, i32* [[I]], align 4
-// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
-// CHECK1: .omp.final.done:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
-// CHECK1-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK1: .execute:
-// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32*
-// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]]
-// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK1: .omp.deinit:
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK1: .exit:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__6
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
-// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK1: cond.true:
-// CHECK1-NEXT: br label [[COND_END:%.*]]
-// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: br label [[COND_END]]
-// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
-// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32*
-// CHECK1-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4
-// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8
-// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
-// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
-// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
-// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
-// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
-// CHECK1-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8
-// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
-// CHECK1-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8*
-// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8
-// CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4)
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
-// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
-// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
-// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99
-// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
-// CHECK1: cond.true7:
-// CHECK1-NEXT: br label [[COND_END9:%.*]]
-// CHECK1: cond.false8:
-// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: br label [[COND_END9]]
-// CHECK1: cond.end9:
-// CHECK1-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ]
-// CHECK1-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
-// CHECK1-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
-// CHECK1: .omp.final.then:
-// CHECK1-NEXT: store i32 10, i32* [[I]], align 4
-// CHECK1-NEXT: store i32 10, i32* [[J]], align 4
-// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
-// CHECK1: .omp.final.done:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__7
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8
-// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32*
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
-// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32
-// CHECK1-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4
-// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
-// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64
-// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
-// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
-// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10
-// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10
-// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]]
-// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1
-// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]]
-// CHECK1-NEXT: store i32 [[ADD8]], i32* [[J]], align 4
-// CHECK1-NEXT: store i32 10, i32* [[K]], align 4
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK1-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
-// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]]
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
-// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]]
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
-// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
-// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
-// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64
-// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]]
-// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
-// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK1: omp.body.continue:
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
-// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4
-// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
-// CHECK1: omp.inner.for.end:
-// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK1: omp.loop.exit:
-// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
-// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
-// CHECK1: .omp.final.then:
-// CHECK1-NEXT: store i32 10, i32* [[I]], align 4
-// CHECK1-NEXT: store i32 10, i32* [[J]], align 4
-// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
-// CHECK1: .omp.final.done:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
-// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
-// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
-// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
-// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
-// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
-// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK2: .execute:
-// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK2-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4
-// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8
-// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32*
-// CHECK2-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4
-// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]]
-// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK2: .omp.deinit:
-// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK2: .exit:
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
-// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
-// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8
-// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
-// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
-// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
-// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
-// CHECK2-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1)
-// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
-// CHECK2-NEXT: [[L2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
-// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK2-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
-// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK2-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK2-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
-// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK2: omp.precond.then:
-// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
-// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK2: cond.true:
-// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK2-NEXT: br label [[COND_END:%.*]]
-// CHECK2: cond.false:
-// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: br label [[COND_END]]
-// CHECK2: cond.end:
-// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
-// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK2: omp.inner.for.cond:
-// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1
-// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP14]], [[ADD]]
-// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK2: omp.inner.for.body:
-// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
-// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
-// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK2-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV8]], align 4
-// CHECK2-NEXT: [[TMP21:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[CONV1]], align 8
-// CHECK2-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32*
-// CHECK2-NEXT: store i32 [[TMP22]], i32* [[CONV9]], align 4
-// CHECK2-NEXT: [[TMP23:%.*]] = load i64, i64* [[L_CASTED]], align 8
-// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8*
-// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8
-// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8*
-// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8
-// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK2-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP21]] to i8*
-// CHECK2-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8
-// CHECK2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
-// CHECK2-NEXT: [[TMP31:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
-// CHECK2-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8
-// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
-// CHECK2-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP23]] to i8*
-// CHECK2-NEXT: store i8* [[TMP33]], i8** [[TMP32]], align 8
-// CHECK2-NEXT: [[TMP34:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4
-// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP35]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP36]], i64 5)
-// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK2: omp.inner.for.inc:
-// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
-// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP39]], [[TMP40]]
-// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP41]], [[TMP42]]
-// CHECK2-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK2-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP43]], [[TMP44]]
-// CHECK2-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]]
-// CHECK2: cond.true14:
-// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4
-// CHECK2-NEXT: br label [[COND_END16:%.*]]
-// CHECK2: cond.false15:
-// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: br label [[COND_END16]]
-// CHECK2: cond.end16:
-// CHECK2-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP45]], [[COND_TRUE14]] ], [ [[TMP46]], [[COND_FALSE15]] ]
-// CHECK2-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: store i32 [[TMP47]], i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
-// CHECK2: omp.inner.for.end:
-// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK2: omp.loop.exit:
-// CHECK2-NEXT: [[TMP48:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4
-// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP49]])
-// CHECK2-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], 0
-// CHECK2-NEXT: br i1 [[TMP51]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
-// CHECK2: .omp.final.then:
-// CHECK2-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP52]], 0
-// CHECK2-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1
-// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV19]], 1
-// CHECK2-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK2-NEXT: store i32 [[ADD20]], i32* [[I5]], align 4
-// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]]
-// CHECK2: .omp.final.done:
-// CHECK2-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-NEXT: [[TMP54:%.*]] = icmp ne i32 [[TMP53]], 0
-// CHECK2-NEXT: br i1 [[TMP54]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
-// CHECK2: .omp.lastprivate.then:
-// CHECK2-NEXT: [[TMP55:%.*]] = load i32, i32* [[CONV1]], align 8
-// CHECK2-NEXT: store i32 [[TMP55]], i32* [[CONV1]], align 8
-// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
-// CHECK2: .omp.lastprivate.done:
-// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK2: omp.precond.end:
-// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
-// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8
-// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[I6:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8
-// CHECK2-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8
-// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8
-// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32*
-// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK2-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK2: omp.precond.then:
-// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
-// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
-// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
-// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32
-// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK2-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32
-// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4
-// CHECK2-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4
-// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
-// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
-// CHECK2: omp.dispatch.cond:
-// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK2-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64
-// CHECK2-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK2-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]]
-// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK2: cond.true:
-// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
-// CHECK2-NEXT: br label [[COND_END:%.*]]
-// CHECK2: cond.false:
-// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK2-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64
-// CHECK2-NEXT: br label [[COND_END]]
-// CHECK2: cond.end:
-// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ]
-// CHECK2-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32
-// CHECK2-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4
-// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK2-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
-// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
-// CHECK2: omp.dispatch.body:
-// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK2: omp.inner.for.cond:
-// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK2-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
-// CHECK2-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK2: omp.inner.for.body:
-// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
-// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK2-NEXT: store i32 [[ADD]], i32* [[I6]], align 4
-// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4
-// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
-// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
-// CHECK2-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
-// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4
-// CHECK2-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8
-// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
-// CHECK2: omp.body.continue:
-// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK2: omp.inner.for.inc:
-// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1
-// CHECK2-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
-// CHECK2: omp.inner.for.end:
-// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
-// CHECK2: omp.dispatch.inc:
-// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
-// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
-// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4
-// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
-// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
-// CHECK2-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4
-// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]]
-// CHECK2: omp.dispatch.end:
-// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
-// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
-// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
-// CHECK2-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
-// CHECK2: .omp.final.then:
-// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[SUB16:%.*]] = sub nsw i32 [[TMP30]], 0
-// CHECK2-NEXT: [[DIV17:%.*]] = sdiv i32 [[SUB16]], 1
-// CHECK2-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1
-// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i32 0, [[MUL18]]
-// CHECK2-NEXT: store i32 [[ADD19]], i32* [[I6]], align 4
-// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]]
-// CHECK2: .omp.final.done:
-// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
-// CHECK2-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
-// CHECK2: .omp.lastprivate.then:
-// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[CONV1]], align 8
-// CHECK2-NEXT: store i32 [[TMP33]], i32* [[CONV1]], align 8
-// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
-// CHECK2: .omp.lastprivate.done:
-// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK2: omp.precond.end:
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
-// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
-// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
-// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK2: .execute:
-// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4
-// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
-// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK2: .omp.deinit:
-// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
-// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK2: .exit:
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2
-// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
-// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8
-// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
-// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
-// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8
-// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
-// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
-// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
-// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK2-NEXT: store i32 0, i32* [[I]], align 4
-// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
-// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
-// CHECK2: omp.precond.then:
-// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
-// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
-// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK2: cond.true:
-// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK2-NEXT: br label [[COND_END:%.*]]
-// CHECK2: cond.false:
-// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: br label [[COND_END]]
-// CHECK2: cond.end:
-// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
-// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
-// CHECK2: omp.inner.for.cond:
-// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
-// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
-// CHECK2: omp.inner.for.body:
-// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
-// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
-// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8
-// CHECK2-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32*
-// CHECK2-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4
-// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8
-// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8*
-// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8
-// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8*
-// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8
-// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8*
-// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8
-// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
-// CHECK2-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
-// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8
-// CHECK2-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4
-// CHECK2-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4)
-// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
-// CHECK2: omp.inner.for.inc:
-// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
-// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
-// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
-// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
-// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]]
-// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
-// CHECK2: cond.true11:
-// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK2-NEXT: br label [[COND_END13:%.*]]
-// CHECK2: cond.false12:
-// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: br label [[COND_END13]]
-// CHECK2: cond.end13:
-// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ]
-// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
-// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
-// CHECK2-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4
-// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
-// CHECK2: omp.inner.for.end:
-// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
-// CHECK2: omp.loop.exit:
-// CHECK2-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4
-// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]])
-// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
-// CHECK2-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0
-// CHECK2-NEXT: br i1 [[TMP45]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
-// CHECK2: .omp.final.then:
-// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP46]], 0
-// CHECK2-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
-// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1
-// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK2-NEXT: store i32 [[ADD17]], i32* [[I3]], align 4
-// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]]
-// CHECK2: .omp.final.done:
-// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
-// CHECK2: omp.precond.end:
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3
-// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8
-// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_LB:%.*]]
= alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I5:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 -// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* -// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK2: omp.precond.then: -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 -// CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 -// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] -// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// 
CHECK2-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK2-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 -// CHECK2-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 -// CHECK2-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK2-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK2-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK2-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK2-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1 -// CHECK2-NEXT: [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1 -// CHECK2-NEXT: [[ADD15:%.*]] = add nsw i32 0, [[MUL14]] -// CHECK2-NEXT: store i32 [[ADD15]], i32* [[I5]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: br label [[OMP_PRECOND_END]] -// CHECK2: omp.precond.end: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call 
void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* 
[[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 -// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK2: cond.true5: -// CHECK2-NEXT: br label [[COND_END7:%.*]] -// CHECK2: cond.false6: -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END7]] -// CHECK2: cond.end7: -// CHECK2-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] -// CHECK2-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 -// CHECK2-NEXT: br i1 [[TMP28]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 
[[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK2-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: [[ADD3:%.*]] = 
add nsw i32 [[TMP10]], 1 -// CHECK2-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK2-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK2-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK2: .execute: -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// CHECK2-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] -// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK2: .omp.deinit: -// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK2: cond.true: -// CHECK2-NEXT: br label [[COND_END:%.*]] -// CHECK2: cond.false: -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END]] -// CHECK2: cond.end: -// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* -// 
CHECK2-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* -// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* -// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 -// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 -// CHECK2-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK2-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 -// CHECK2-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* -// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 -// CHECK2-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] -// CHECK2-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 -// CHECK2-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] -// CHECK2: cond.true7: -// CHECK2-NEXT: br label [[COND_END9:%.*]] -// CHECK2: cond.false8: -// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: br label [[COND_END9]] -// CHECK2: cond.end9: -// CHECK2-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] -// CHECK2-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK2-NEXT: br 
i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK2-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 -// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 -// CHECK2-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 -// CHECK2-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 -// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK2: omp.inner.for.cond: -// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: 
[[CONV4:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK2-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 -// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] -// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK2: omp.inner.for.body: -// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 -// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] -// CHECK2-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] -// CHECK2-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 -// CHECK2-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] -// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] -// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] -// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK2-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 -// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] -// CHECK2-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 -// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK2: omp.body.continue: -// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK2: omp.inner.for.inc: -// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK2-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 -// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK2: omp.inner.for.end: -// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK2: omp.loop.exit: -// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK2-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK2: .omp.final.then: -// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK2-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK2: .omp.final.done: -// CHECK2-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], 
i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// 
CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP8]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] -// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: 
[[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP16]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK3-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP17]], [[ADD]] -// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP21]], i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP23]], i32* [[L_CASTED]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK3-NEXT: store i8* [[TMP26]], i8** [[TMP25]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP20]] to i8* -// CHECK3-NEXT: store i8* [[TMP28]], i8** [[TMP27]], align 4 -// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP22]] to i8* -// CHECK3-NEXT: store i8* [[TMP30]], i8** [[TMP29]], align 4 -// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP32:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP32]], i8** [[TMP31]], align 4 -// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK3-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP24]] to i8* -// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP33]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -// CHECK3-NEXT: [[TMP37:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP36]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP37]], i32 5) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP38]], [[TMP39]] -// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP40]], [[TMP41]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* 
[[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP42]], [[TMP43]] -// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]] -// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK3: cond.true11: -// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK3-NEXT: br label [[COND_END13:%.*]] -// CHECK3: cond.false12: -// CHECK3-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END13]] -// CHECK3: cond.end13: -// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP46]], [[COND_TRUE11]] ], [ [[TMP47]], [[COND_FALSE12]] ] -// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP48]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP49:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP50]]) -// CHECK3-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK3-NEXT: br i1 [[TMP52]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: [[TMP53:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP53]], 0 -// CHECK3-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK3-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: [[TMP54:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 -// CHECK3-NEXT: br i1 [[TMP55]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK3: .omp.lastprivate.then: -// CHECK3-NEXT: [[TMP56:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP56]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK3: .omp.lastprivate.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca 
i32*, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK3-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK3: omp.dispatch.cond: -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], 
[[TMP10]] -// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK3: omp.dispatch.body: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK3-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK3-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK3: omp.dispatch.inc: -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK3: omp.dispatch.end: -// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* 
[[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK3-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK3-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 -// CHECK3-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 -// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] -// CHECK3-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK3-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK3: .omp.lastprivate.then: -// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK3: .omp.lastprivate.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: 
[[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* 
[[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK3-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK3-NEXT: br i1 
[[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK3: cond.true10: -// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: br label [[COND_END12:%.*]] -// CHECK3: cond.false11: -// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END12]] -// CHECK3: cond.end12: -// CHECK3-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK3-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 -// CHECK3-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 -// CHECK3-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 -// CHECK3-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* 
[[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK3: omp.precond.then: -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK3-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK3-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK3-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK3-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: 
[[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK3-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK3-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 -// CHECK3-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 -// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] -// CHECK3-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: br label [[OMP_PRECOND_END]] -// CHECK3: omp.precond.end: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// 
CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK3-NEXT: br label 
[[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK3: cond.true5: -// CHECK3-NEXT: br label [[COND_END7:%.*]] -// CHECK3: cond.false6: -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END7]] -// CHECK3: cond.end7: -// CHECK3-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK3-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -// CHECK3-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], 
align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK3-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK3-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: ret void -// -// -// 
CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK3-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 99, 
i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK3-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = 
load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK3: cond.true6: -// CHECK3-NEXT: br label [[COND_END8:%.*]] -// CHECK3: cond.false7: -// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: br label [[COND_END8]] -// CHECK3: cond.end8: -// CHECK3-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK3-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -// CHECK3-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK3-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32* 
[[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK3-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK3-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK3-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK3-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// 
CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK3-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK3: omp.body.continue: -// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK3: omp.inner.for.end: -// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK3: omp.loop.exit: -// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK3: .omp.final.then: -// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK3-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK3: .omp.final.done: -// CHECK3-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 // CHECK4-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK4-NEXT: entry: @@ -3350,8 +93,6 @@ // CHECK4-NEXT: br label [[DOTEXIT:%.*]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -3517,8 +258,6 @@ // CHECK4-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -3656,8 +395,6 @@ // CHECK4-NEXT: br label [[OMP_PRECOND_END]] // CHECK4: omp.precond.end: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 // CHECK4-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -3686,8 +423,6 @@ // CHECK4-NEXT: br label [[DOTEXIT:%.*]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -3828,8 +563,6 @@ // CHECK4-NEXT: br label [[OMP_PRECOND_END]] // CHECK4: omp.precond.end: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias 
[[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -3930,8 +663,6 @@ // CHECK4-NEXT: br label [[OMP_PRECOND_END]] // CHECK4: omp.precond.end: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK4-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -3954,8 +685,6 @@ // CHECK4-NEXT: br label [[DOTEXIT:%.*]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4054,8 +783,6 @@ // CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK4: .omp.final.done: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4127,8 +854,6 @@ // CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK4: .omp.final.done: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK4-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4157,8 +882,6 @@ // CHECK4-NEXT: br label [[DOTEXIT:%.*]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4270,8 +993,6 @@ // CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK4: .omp.final.done: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4366,8 +1087,6 @@ // CHECK4-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK4: .omp.final.done: // CHECK4-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 // CHECK5-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -4402,8 +1121,6 @@ // CHECK5-NEXT: br label [[DOTEXIT:%.*]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -4564,8 +1281,6 @@ // CHECK5: omp.precond.end: // CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 
[[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -4703,8 +1418,6 @@ // CHECK5-NEXT: br label [[OMP_PRECOND_END]] // CHECK5: omp.precond.end: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 // CHECK5-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -4733,8 +1446,6 @@ // CHECK5-NEXT: br label [[DOTEXIT:%.*]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -4875,8 +1586,6 @@ // CHECK5-NEXT: br label [[OMP_PRECOND_END]] // CHECK5: omp.precond.end: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -4977,8 +1686,6 @@ // CHECK5-NEXT: br label [[OMP_PRECOND_END]] // CHECK5: omp.precond.end: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK5-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -5001,8 +1708,6 @@ // CHECK5-NEXT: br label [[DOTEXIT:%.*]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -5101,8 +1806,6 @@ // CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK5: .omp.final.done: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -5174,8 +1877,6 @@ // CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK5: .omp.final.done: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK5-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -5204,8 +1905,6 @@ // CHECK5-NEXT: br label [[DOTEXIT:%.*]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -5317,8 +2016,6 @@ // CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK5: .omp.final.done: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* 
nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -5413,8 +2110,6 @@ // CHECK5-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK5: .omp.final.done: // CHECK5-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 // CHECK6-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK6-NEXT: entry: @@ -5449,8 +2144,6 @@ // CHECK6-NEXT: br label [[DOTEXIT:%.*]] // CHECK6: .exit: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -5611,8 +2304,6 @@ // CHECK6: omp.precond.end: // CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -5750,8 +2441,6 @@ // CHECK6-NEXT: br label [[OMP_PRECOND_END]] // CHECK6: omp.precond.end: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 // CHECK6-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -5780,8 +2469,6 @@ // CHECK6-NEXT: br label [[DOTEXIT:%.*]] // CHECK6: .exit: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -5922,8 +2609,6 @@ // CHECK6-NEXT: br label [[OMP_PRECOND_END]] // CHECK6: omp.precond.end: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -6024,8 +2709,6 @@ // CHECK6-NEXT: br label [[OMP_PRECOND_END]] // CHECK6: omp.precond.end: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK6-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -6048,8 +2731,6 @@ // CHECK6-NEXT: br label [[DOTEXIT:%.*]] // CHECK6: .exit: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -6148,8 +2829,6 @@ // CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK6: .omp.final.done: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* 
nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -6221,8 +2900,6 @@ // CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK6: .omp.final.done: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK6-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -6251,8 +2928,6 @@ // CHECK6-NEXT: br label [[DOTEXIT:%.*]] // CHECK6: .exit: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -6364,8 +3039,6 @@ // CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK6: .omp.final.done: // CHECK6-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK6-NEXT: entry: @@ -6460,8 +3133,6 @@ // CHECK6-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK6: .omp.final.done: // CHECK6-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 // CHECK7-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -6500,8 +3171,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -6673,8 +3342,6 @@ // CHECK7-NEXT: [[TMP59:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP59]]) // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -6820,8 +3487,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 // CHECK7-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -6852,8 +3517,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -6998,8 +3661,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], 
i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7105,8 +3766,6 @@ // CHECK7-NEXT: br label [[OMP_PRECOND_END]] // CHECK7: omp.precond.end: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK7-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7129,8 +3788,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7231,8 +3888,6 @@ // CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK7: .omp.final.done: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7308,8 +3963,6 @@ // CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK7: .omp.final.done: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK7-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7340,8 +3993,6 @@ // CHECK7-NEXT: br label [[DOTEXIT:%.*]] // CHECK7: .exit: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7457,8 +4108,6 @@ // CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK7: .omp.final.done: // CHECK7-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK7-NEXT: entry: @@ -7559,8 +4208,6 @@ // CHECK7-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK7: .omp.final.done: // CHECK7-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 // CHECK8-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK8-NEXT: entry: @@ -7599,8 +4246,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -7767,8 +4412,6 @@ // CHECK8: omp.precond.end: // CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias 
[[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -7914,8 +4557,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 // CHECK8-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -7946,8 +4587,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8092,8 +4731,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8199,8 +4836,6 @@ // CHECK8-NEXT: br label [[OMP_PRECOND_END]] // CHECK8: omp.precond.end: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK8-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8223,8 +4858,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8325,8 +4958,6 @@ // CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK8: .omp.final.done: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8402,8 +5033,6 @@ // CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK8: .omp.final.done: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK8-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8434,8 +5063,6 @@ // CHECK8-NEXT: br label [[DOTEXIT:%.*]] // CHECK8: .exit: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8551,8 +5178,6 @@ // CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK8: .omp.final.done: // CHECK8-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 
[[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { // CHECK8-NEXT: entry: @@ -8653,8 +5278,6 @@ // CHECK8-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK8: .omp.final.done: // CHECK8-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 // CHECK9-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -8689,8 +5312,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -8856,8 +5477,6 @@ // CHECK9-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK9-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -8995,8 +5614,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 // CHECK9-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -9025,8 +5642,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -9167,8 +5782,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -9269,8 +5882,6 @@ // CHECK9-NEXT: br label [[OMP_PRECOND_END]] // CHECK9: omp.precond.end: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK9-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -9293,8 +5904,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -9393,8 +6002,6 @@ // CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK9: .omp.final.done: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK9-SAME: (i32* noalias 
[[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -9466,8 +6073,6 @@ // CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK9: .omp.final.done: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK9-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -9496,8 +6101,6 @@ // CHECK9-NEXT: br label [[DOTEXIT:%.*]] // CHECK9: .exit: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -9609,8 +6212,6 @@ // CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK9: .omp.final.done: // CHECK9-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK9-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK9-NEXT: entry: @@ -9705,8 +6306,6 @@ // CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK9: .omp.final.done: // CHECK9-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 // CHECK10-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK10-NEXT: entry: @@ -9741,8 +6340,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -9908,8 +6505,6 @@ // CHECK10-NEXT: [[TMP57:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK10-NEXT: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[TMP57]]) // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__1 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -10047,8 +6642,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 // CHECK10-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -10077,8 +6670,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__2 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -10219,8 +6810,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // 
CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -10321,8 +6910,6 @@ // CHECK10-NEXT: br label [[OMP_PRECOND_END]] // CHECK10: omp.precond.end: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 // CHECK10-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -10345,8 +6932,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__4 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -10445,8 +7030,6 @@ // CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK10: .omp.final.done: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__5 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -10518,8 +7101,6 @@ // CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK10: .omp.final.done: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 // CHECK10-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -10548,8 +7129,6 @@ // CHECK10-NEXT: br label [[DOTEXIT:%.*]] // CHECK10: .exit: // CHECK10-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__6 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -10660,9 +7239,7 @@ // CHECK10-NEXT: store i32 10, i32* [[J]], align 4 // CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK10: .omp.final.done: -// CHECK10-NEXT: ret void -// -// +// CHECK10-NEXT: ret void // CHECK10-LABEL: define {{[^@]+}}@__omp_outlined__7 // CHECK10-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { // CHECK10-NEXT: entry: @@ -10757,2098 +7334,5233 @@ // CHECK10-NEXT: br label [[DOTOMP_FINAL_DONE]] // CHECK10: .omp.final.done: // CHECK10-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 // CHECK11-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: 
[[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x 
i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK11-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK11-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] +// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, 
i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* +// CHECK11-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK11-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK11-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* +// CHECK11-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 +// CHECK11-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK11-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK11-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK11: cond.true11: +// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK11-NEXT: br label 
[[COND_END13:%.*]] +// CHECK11: cond.false12: +// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END13]] +// CHECK11: cond.end13: +// CHECK11-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] +// CHECK11-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK11-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK11-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 +// CHECK11-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK11-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK11-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// 
CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK11-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK11: omp.dispatch.cond: +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP13]], i32* 
[[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK11: omp.dispatch.body: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK11-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK11-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK11: omp.dispatch.inc: +// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK11: omp.dispatch.end: +// CHECK11-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK11-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK11-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK11-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK11-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// 
CHECK11: .omp.final.done: +// CHECK11-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK11-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK11: .omp.lastprivate.then: +// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK11: .omp.lastprivate.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 +// CHECK11-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK11: .execute: +// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK11-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK11: .omp.deinit: +// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK11-NEXT: br label [[DOTEXIT:%.*]] +// CHECK11: .exit: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: 
[[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK11-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK11-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// 
CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK11-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK11-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK11-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK11-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK11-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK11-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK11-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK11-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK11-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK11-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK11-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK11: cond.true10: +// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END12:%.*]] +// CHECK11: cond.false11: +// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END12]] +// CHECK11: cond.end12: +// CHECK11-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK11-NEXT: 
store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK11-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK11-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK11-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11: omp.precond.then:
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK11-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK11-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK11-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+// CHECK11-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK11-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK11-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK11-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2
+// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK11: omp.body.continue:
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]])
+// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK11: .omp.final.then:
+// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK11-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
+// CHECK11-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
+// CHECK11-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
+// CHECK11-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4
+// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK11: .omp.final.done:
+// CHECK11-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.end:
+// CHECK11-NEXT: ret void
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK11: .execute:
+// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK11-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]]
+// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK11: .omp.deinit:
+// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK11-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK11: .exit:
+// CHECK11-NEXT: ret void
+// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK11-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK11-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK11-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK11-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK11-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
+// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK11-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3)
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK11-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK11-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK11: cond.true5:
+// CHECK11-NEXT: br label [[COND_END7:%.*]]
+// CHECK11: cond.false6:
+// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END7]]
+// CHECK11: cond.end7:
+// CHECK11-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK11-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0
+// CHECK11-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK11: .omp.final.then:
+// CHECK11-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK11: .omp.final.done:
+// CHECK11-NEXT: ret void
+// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
+// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+// CHECK11-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK11-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4
+// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK11: omp.body.continue:
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK11-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK11-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK11: .omp.final.then:
+// CHECK11-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK11: .omp.final.done:
+// CHECK11-NEXT: ret void
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54
+// CHECK11-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
 // CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 // CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
 // CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
-// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
-// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
-// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
 // CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
 // CHECK11-NEXT: br label [[DOTEXECUTE:%.*]]
 // CHECK11: .execute:
-// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
-// CHECK11-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
-// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
-// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4
-// CHECK11-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4
-// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4
 // CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK11-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK11-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]]
 // CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]]
 // CHECK11: .omp.deinit:
 // CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK11-NEXT: br label [[DOTEXIT:%.*]]
 // CHECK11: .exit:
 // CHECK11-NEXT: ret void
+// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK11-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK11-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK11-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
+// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK11-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8*
+// CHECK11-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
+// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK11-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK11-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4
+// CHECK11-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4)
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK11-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK11-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK11-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK11: cond.true6:
+// CHECK11-NEXT: br label [[COND_END8:%.*]]
+// CHECK11: cond.false7:
+// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END8]]
+// CHECK11: cond.end8:
+// CHECK11-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK11-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]])
+// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+// CHECK11-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK11: .omp.final.then:
+// CHECK11-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK11-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK11: .omp.final.done:
+// CHECK11-NEXT: ret void
+// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4
+// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK11-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK11-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK11-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK11-NEXT: store i32 [[ADD5]], i32* [[J]], align 4
+// CHECK11-NEXT: store i32 10, i32* [[K]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4
+// CHECK11-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4
+// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4
+// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4
+// CHECK11-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK11-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4
+// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK11: omp.body.continue:
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]])
+// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK11: .omp.final.then:
+// CHECK11-NEXT: store i32 10, i32* [[I]], align 4
+// CHECK11-NEXT: store i32 10, i32* [[J]], align 4
+// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK11: .omp.final.done:
+// CHECK11-NEXT: ret void
+// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK12: .execute:
+// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4
+// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK12-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK12: .omp.deinit:
+// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK12-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK12: .exit:
+// CHECK12-NEXT: ret void
+// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4
+// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
+// CHECK12-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
+// CHECK12-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK12-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK12-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK12-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK12: omp.precond.then:
+// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK12: cond.true:
+// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK12-NEXT: br label [[COND_END:%.*]]
+// CHECK12: cond.false:
+// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: br label [[COND_END]]
+// CHECK12: cond.end:
+// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK12: omp.inner.for.cond:
+// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK12-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]]
+// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK12: omp.inner.for.body:
+// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4
+// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4
+// CHECK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK12-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8*
+// CHECK12-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK12-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK12-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK12-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK12-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8*
+// CHECK12-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4
+// CHECK12-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK12-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8*
+// CHECK12-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4
+// CHECK12-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK12-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8*
+// CHECK12-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4
+// CHECK12-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4
+// CHECK12-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5)
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]]
+// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]]
+// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK12-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+// CHECK12-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]]
+// CHECK12: cond.true11:
+// CHECK12-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK12-NEXT: br label [[COND_END13:%.*]]
+// CHECK12: cond.false12:
+// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: br label [[COND_END13]]
+// CHECK12: cond.end13:
+// CHECK12-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ]
+// CHECK12-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK12: omp.loop.exit:
+// CHECK12-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]])
+// CHECK12-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0
+// CHECK12-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK12: .omp.final.then:
+// CHECK12-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0
+// CHECK12-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1
+// CHECK12-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK12-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4
+// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK12: .omp.final.done:
+// CHECK12-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
+// CHECK12-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK12: .omp.lastprivate.then:
+// CHECK12-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4
+// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK12: .omp.lastprivate.done:
+// CHECK12-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK12: omp.precond.end:
+// CHECK12-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
+// CHECK12-NEXT: ret void
+// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4
+// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK12: omp.precond.then:
+// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32)
+// CHECK12-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK12: omp.dispatch.cond:
+// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]]
+// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK12: cond.true:
+// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: br label [[COND_END:%.*]]
+// CHECK12: cond.false:
+// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: br label [[COND_END]]
+// CHECK12: cond.end:
+// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK12: omp.dispatch.body:
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK12: omp.inner.for.cond:
+// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK12: omp.inner.for.body:
+// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK12-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
+// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4
+// CHECK12-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4
+// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK12: omp.body.continue:
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK12: omp.dispatch.inc:
+// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: br label [[OMP_DISPATCH_COND]]
+// CHECK12: omp.dispatch.end:
+// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]])
+// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK12-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK12: .omp.final.then:
+// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0
+// CHECK12-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1
+// CHECK12-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1
+// CHECK12-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK12-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4
+// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK12: .omp.final.done:
+// CHECK12-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK12-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK12: .omp.lastprivate.then:
+// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4
+// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK12: .omp.lastprivate.done:
+// CHECK12-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK12: omp.precond.end:
+// CHECK12-NEXT: ret void
+// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0)
+// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK12: .execute:
+// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK12-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]]
+// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK12: .omp.deinit:
+// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
+// CHECK12-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK12: .exit:
+// CHECK12-NEXT: ret void
+// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4
+// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK12: omp.precond.then:
+// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK12-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK12: cond.true:
+// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: br label [[COND_END:%.*]]
+// CHECK12: cond.false:
+// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: br label [[COND_END]]
+// CHECK12: cond.end:
+// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK12: omp.inner.for.cond:
+// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK12-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK12: omp.inner.for.body:
+// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK12-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK12-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4
+// CHECK12-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK12-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8*
+// CHECK12-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4
+// CHECK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK12-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8*
+// CHECK12-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4
+// CHECK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK12-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8*
+// CHECK12-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4
+// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK12-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4)
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK12-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK12-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK12: cond.true10:
+// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: br label [[COND_END12:%.*]]
+// CHECK12: cond.false11:
+// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: br label [[COND_END12]]
+// CHECK12: cond.end12:
+// CHECK12-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK12-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK12: omp.loop.exit:
+// CHECK12-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]])
+// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0
+// CHECK12-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK12: .omp.final.then:
+// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0
+// CHECK12-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1
+// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1
+// CHECK12-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK12-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4
+// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK12: .omp.final.done:
+// CHECK12-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK12: omp.precond.end:
+// CHECK12-NEXT: ret void
+// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4
+// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK12: omp.precond.then:
+// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: store i32
[[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK12-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK12-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK12-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK12-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK12-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK12-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK12-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK12-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK12-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK12-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK12-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: br label [[OMP_PRECOND_END]] 
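
Editorial aside: the CHECK12 run here is winding down `__omp_outlined__3`, the inner worker paired with the distribute loop over the `[1000 x i16]` array. For orientation, a hedged sketch of the kind of source that produces this shape; the actual test source sits above this hunk, so every name below (`bump`, `aa`, `n`) is inferred from the IR checks, not quoted from the test file:

```cpp
// Hypothetical reconstruction, inferred from the CHECK12 lines above.
template <typename T>
void bump(T (&aa)[1000], int n) {
  // Clang lowers this SPMD pattern to the sequence asserted here:
  // __kmpc_for_static_init_4(..., /*schedule=*/91, ...) for the
  // distribute loop, then __kmpc_parallel_51 with the chunk bounds and
  // captures marshalled through the [N x i8*] captured_vars_addrs array,
  // and __kmpc_for_static_init_4(..., 33, ...) inside the worker.
  #pragma omp target teams distribute parallel for map(tofrom: aa)
  for (int i = 0; i < n; ++i)
    aa[i] += 1; // for T = short this is the sext/add/trunc triple above
}

// e.g. short aa[1000] = {}; bump(aa, 100);
```
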
+// CHECK12: omp.precond.end: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 +// CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: 
[[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK12-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK12-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK12: cond.true5: +// CHECK12-NEXT: br label [[COND_END7:%.*]] +// CHECK12: cond.false6: +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// 
CHECK12-NEXT: br label [[COND_END7]] +// CHECK12: cond.end7: +// CHECK12-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK12-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK12-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: 
[[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK12-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK12-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK12-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 +// CHECK12-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK12: .execute: +// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK12-NEXT: [[TMP2:%.*]] 
= load i32, i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK12-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK12: .omp.deinit: +// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK12-NEXT: br label [[DOTEXIT:%.*]] +// CHECK12: .exit: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK12: cond.true: +// CHECK12-NEXT: br label [[COND_END:%.*]] +// CHECK12: cond.false: +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END]] +// CHECK12: cond.end: +// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// 
CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK12-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK12-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK12-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK12-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK12-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK12-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK12-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK12-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK12-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK12-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK12: cond.true6: +// CHECK12-NEXT: br label [[COND_END8:%.*]] +// CHECK12: cond.false7: +// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* 
[[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: br label [[COND_END8]] +// CHECK12: cond.end8: +// CHECK12-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK12-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK12-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK12-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK12-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: ret void +// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK12-NEXT: entry: +// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: store i32 0, 
i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK12: omp.inner.for.cond: +// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK12: omp.inner.for.body: +// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK12-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK12-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK12-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK12-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK12-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK12-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK12-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK12: omp.body.continue: +// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK12: omp.inner.for.inc: +// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK12-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK12: omp.inner.for.end: +// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK12: omp.loop.exit: +// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK12-NEXT: [[TMP20:%.*]] = icmp ne 
i32 [[TMP19]], 0 +// CHECK12-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK12: .omp.final.then: +// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK12-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK12: .omp.final.done: +// CHECK12-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26 +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV2]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[CONV3]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i32]* [[TMP0]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: 
[[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK11-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK11-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK11-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// 
CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK11-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK11-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK11-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK11-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK11-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK11-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK11-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -// CHECK11-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP38:%.*]] 
= load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -// CHECK11-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK11: cond.true11: -// CHECK11-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK11-NEXT: br label [[COND_END13:%.*]] -// CHECK11: cond.false12: -// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END13]] -// CHECK11: cond.end13: -// CHECK11-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] -// CHECK11-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) -// CHECK11-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -// CHECK11-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 -// CHECK11-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK11-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK11-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK11: .omp.lastprivate.then: -// CHECK11-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK11-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// 
CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[L_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[L2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L2]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] 
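
The new CHECK1 lines are where this patch becomes visible in the tests: the escaped lastprivate `l` is now backed by `call i8* @__kmpc_alloc_shared(i64 4)` (asserted just above) and released by `__kmpc_free_shared` at `omp.precond.end` (asserted further down in this hunk), replacing the `__kmpc_data_sharing_push_stack`/`__kmpc_data_sharing_pop_stack` record machinery deleted from the CHECK11 run. A minimal model of that pairing, with the caveat that the declarations mirror the calls as they appear in the IR rather than the device runtime's actual header:

```cpp
// Illustrative sketch only; signatures copied from the IR calls above.
#include <cstdint>

extern "C" void *__kmpc_alloc_shared(uint64_t size);
extern "C" void __kmpc_free_shared(void *ptr);

void outlined_with_escaping_local() {
  // One allocation per escaped local, sized for the variable itself
  // (4 bytes for the i32 'l'), instead of the old per-region globalized
  // record pushed and popped around the whole function.
  void *storage = __kmpc_alloc_shared(4);
  int *l = static_cast<int *>(storage); // the [[L_ON_STACK]] bitcast
  *l = 0;
  // ... workers spawned via __kmpc_parallel_51 may read/write *l ...
  __kmpc_free_shared(storage); // emitted on the omp.precond.end path
}
```

Per-variable allocations like this are what lets later optimization passes promote the storage back to registers or shared memory when the runtime can prove it never escapes, which the monolithic record could not support.
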
+// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV8:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV8]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: [[CONV9:%.*]] = bitcast i64* [[L_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV9]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, i64* [[L_CASTED]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 4 +// CHECK1-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP21]] to i8* +// CHECK1-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i32]*, i64)* @__omp_outlined__1 to 
i8*), i8* null, i8** [[TMP34]], i64 5) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] +// CHECK1-NEXT: store i32 [[ADD12]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] +// CHECK1-NEXT: br i1 [[CMP13]], label [[COND_TRUE14:%.*]], label [[COND_FALSE15:%.*]] +// CHECK1: cond.true14: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: br label [[COND_END16:%.*]] +// CHECK1: cond.false15: +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END16]] +// CHECK1: cond.end16: +// CHECK1-NEXT: [[COND17:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE14]] ], [ [[TMP44]], [[COND_FALSE15]] ] +// CHECK1-NEXT: store i32 [[COND17]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) +// CHECK1-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 +// CHECK1-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP50]], 0 +// CHECK1-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV19]], 1 +// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD20]], i32* [[I5]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK1-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP53:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: store i32 [[TMP53]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: 
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[L2]]) +// CHECK1-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK11-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* 
[[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK11-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK11: omp.dispatch.cond: -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK11: omp.dispatch.body: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK11-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK11-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK11-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK11: omp.body.continue: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK11: omp.dispatch.inc: -// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], 
[[TMP23]] -// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK11-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK11: omp.dispatch.end: -// CHECK11-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK11-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK11-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 -// CHECK11-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 -// CHECK11-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] -// CHECK11-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK11-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK11: .omp.lastprivate.then: -// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 8 +// CHECK1-NEXT: [[L_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: 
store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[L]], i64* [[L_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK1: omp.dispatch.cond: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV7:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[TMP10:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i64 [[CONV7]], [[TMP10]] +// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CONV9:%.*]] = sext i32 [[TMP12]] to i64 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP11]], [[COND_TRUE]] ], [ [[CONV9]], [[COND_FALSE]] ] +// CHECK1-NEXT: [[CONV10:%.*]] = trunc i64 [[COND]] to i32 +// CHECK1-NEXT: store i32 [[CONV10]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 
[[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK1-NEXT: br i1 [[CMP11]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK1: omp.dispatch.body: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[CMP12:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK1-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK1-NEXT: store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK1: omp.dispatch.inc: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK1-NEXT: store i32 [[ADD15]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK1: omp.dispatch.end: +// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK1-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB16:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK1-NEXT: [[DIV17:%.*]] = sdiv i32 [[SUB16]], 1 +// CHECK1-NEXT: [[MUL18:%.*]] = mul nsw i32 [[DIV17]], 1 +// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i32 0, [[MUL18]] +// CHECK1-NEXT: store i32 [[ADD19]], i32* [[I6]], align 4 +// CHECK1-NEXT: br label 
[[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK1-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK1: .omp.lastprivate.then: +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[CONV1]], align 8 +// CHECK1-NEXT: store i32 [[TMP33]], i32* [[CONV1]], align 8 +// CHECK1-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK1: .omp.lastprivate.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK11-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK11: .execute: -// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK11-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK11: .omp.deinit: -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK11-NEXT: br label [[DOTEXIT:%.*]] -// CHECK11: .exit: -// CHECK11-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// 
CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 
1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV6:%.*]] = bitcast i64* [[N_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP18]], i32* [[CONV6]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to i8* +// CHECK1-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to i8* +// CHECK1-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to i8* +// CHECK1-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP27:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], 
align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP30]], i64 4) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK1-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK1: cond.true11: +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END13:%.*]] +// CHECK1: cond.false12: +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END13]] +// CHECK1: cond.end13: +// CHECK1-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE11]] ], [ [[TMP40]], [[COND_FALSE12]] ] +// CHECK1-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP41]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP42:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP43]]) +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], 0 +// CHECK1-NEXT: br i1 [[TMP45]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP46]], 0 +// CHECK1-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD17]], i32* [[I3]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK11-LABEL: define 
{{[^@]+}}@__omp_outlined__2 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// 
CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK11-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK11-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK11-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK11-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK11-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK11-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK11-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK11-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK11-NEXT: store i32 [[ADD7]], i32* 
[[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK11-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK11-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK11: cond.true10: -// CHECK11-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: br label [[COND_END12:%.*]] -// CHECK11: cond.false11: -// CHECK11-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END12]] -// CHECK11: cond.end12: -// CHECK11-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK11-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK11-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 -// CHECK11-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 -// CHECK11-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 -// CHECK11-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: 
[[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I5:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8 +// CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* +// CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP6]] to i32 +// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[TMP10]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP11]] +// CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 
0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[I5]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1 +// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16 +// CHECK1-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX]], align 2 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK1-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1 +// CHECK1-NEXT: [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1 +// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 0, [[MUL14]] +// CHECK1-NEXT: store i32 [[ADD15]], i32* [[I5]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, 
align 4 -// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK11-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK11: omp.precond.then: -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK11-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK11-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK11-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -// CHECK11-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK11-NEXT: 
[[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK11-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK11-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK11: omp.body.continue: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK11-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK11-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK11-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 -// CHECK11-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 -// CHECK11-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] -// CHECK11-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: br label [[OMP_PRECOND_END]] -// CHECK11: omp.precond.end: -// CHECK11-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l37 +// CHECK1-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK11-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// 
CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK11: .execute: -// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK11-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK11: .omp.deinit: -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK11-NEXT: br label [[DOTEXIT:%.*]] -// CHECK11: .exit: -// CHECK11-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: 
[[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP16:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP17]], i64 3) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK1-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP24]], 9 +// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK1: cond.true5: +// CHECK1-NEXT: br label [[COND_END7:%.*]] +// CHECK1: cond.false6: +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END7]] +// CHECK1: cond.end7: +// CHECK1-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP25]], [[COND_FALSE6]] ] +// CHECK1-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 
4 +// CHECK1-NEXT: store i32 [[TMP26]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +// CHECK1-NEXT: br i1 [[TMP28]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label 
[[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK11-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK11-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK11-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK11-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK11-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK11: cond.true5: -// CHECK11-NEXT: br label [[COND_END7:%.*]] -// CHECK11: cond.false6: -// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END7]] -// CHECK11: cond.end7: -// CHECK11-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK11-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK11-NEXT: 
[[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -// CHECK11-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK1-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: 
omp.inner.for.body: +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK1-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK1-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* 
[[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK11-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK11-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK11: omp.body.continue: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK11-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK11-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l42 +// CHECK1-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// 
CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK1: .execute: +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP2]], i32* [[CONV1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP3]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK1: .omp.deinit: +// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK11-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK11-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK11: .execute: -// CHECK11-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK11-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK11-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK11: .omp.deinit: -// CHECK11-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK11-NEXT: br label [[DOTEXIT:%.*]] -// CHECK11: .exit: -// CHECK11-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], 
i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// 
CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = bitcast i64* [[F_CASTED]] to i32* +// CHECK1-NEXT: store i32 [[TMP11]], i32* [[CONV3]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i64, i64* [[F_CASTED]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8* +// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8* +// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2 +// CHECK1-NEXT: [[TMP18:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK1-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 3 +// CHECK1-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP12]] to i8* +// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP19]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, [10 x [10 x i32]]*, i64)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP21]], i64 4) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK1-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] +// CHECK1-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP28]], 99 +// CHECK1-NEXT: br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]] +// CHECK1: cond.true7: +// CHECK1-NEXT: br label [[COND_END9:%.*]] +// CHECK1: cond.false8: +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END9]] +// CHECK1: cond.end9: +// CHECK1-NEXT: [[COND10:%.*]] = phi i32 [ 99, [[COND_TRUE7]] ], [ [[TMP29]], [[COND_FALSE8]] ] +// CHECK1-NEXT: store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK1-NEXT: 
[[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK1-NEXT: br i1 [[TMP32]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK1-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 8 +// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32 +// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP2]] to i32 +// CHECK1-NEXT: store i32 [[CONV2]], i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label 
[[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[CONV4:%.*]] = sext i32 [[TMP6]] to i64 +// CHECK1-NEXT: [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP7]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[DIV5]], 10 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL6]] +// CHECK1-NEXT: [[MUL7:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL7]] +// CHECK1-NEXT: store i32 [[ADD8]], i32* [[J]], align 4 +// CHECK1-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8 +// CHECK1-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP11]], [[MUL9]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD10]], [[TMP14]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK1-NEXT: [[IDXPROM12:%.*]] = sext i32 [[TMP16]] to i64 +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i64 0, i64 [[IDXPROM12]] +// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4 +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK1-NEXT: store i32 [[ADD14]], i32* [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK1-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: ret void // // -// CHECK11-LABEL: define 
{{[^@]+}}@__omp_outlined__6 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK11-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK11-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK11-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK11: cond.true: -// CHECK11-NEXT: br label [[COND_END:%.*]] -// CHECK11: cond.false: -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END]] -// CHECK11: cond.end: -// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK11-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK11-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: 
[[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK11-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK11-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK11-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK11-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK11-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK11-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK11-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK11-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK11-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK11: cond.true6: -// CHECK11-NEXT: br label [[COND_END8:%.*]] -// CHECK11: cond.false7: -// CHECK11-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: br label [[COND_END8]] -// CHECK11: cond.end8: -// CHECK11-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK11-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK11-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* 
@[[GLOB1]], i32 [[TMP2]]) -// CHECK11-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -// CHECK11-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK11-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26 +// CHECK2-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK2-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// 
CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[L_ON_STACK:%.*]] = bitcast i8* [[L1]] to i32* +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// 
CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK2-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK2-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: 
store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK2: cond.true11: +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: br label [[COND_END13:%.*]] +// CHECK2: cond.false12: +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END13]] +// CHECK2: cond.end13: +// CHECK2-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK2-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0 +// CHECK2-NEXT: br i1 [[TMP47]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP48]], 0 +// CHECK2-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CHECK2-NEXT: br i1 [[TMP50]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP51:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP51]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[L1]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK2-NEXT: [[L_ADDR:%.*]] = 
alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK2: omp.dispatch.cond: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP12:%.*]] = load 
i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK2: omp.dispatch.body: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK2-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK2-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK2: omp.dispatch.inc: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK2: omp.dispatch.end: +// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK2-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: 
[[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK2-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK2-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK2-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK2-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK2: .omp.lastprivate.then: +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK2: .omp.lastprivate.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK11-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK11-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK11-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK11-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK11-NEXT: call void 
@__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK11-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK11-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK11: omp.inner.for.cond: -// CHECK11-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK11-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK11: omp.inner.for.body: -// CHECK11-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK11-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK11-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK11-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK11-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK11-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK11-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK11-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK11-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK11-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK11-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK11-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK11-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK11-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK11: omp.body.continue: -// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK11: omp.inner.for.inc: -// CHECK11-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK11-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK11: omp.inner.for.end: -// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK11: omp.loop.exit: -// CHECK11-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK11: .omp.final.then: -// CHECK11-NEXT: store i32 10, i32* [[I]], align 4 -// 
CHECK11-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK11: .omp.final.done: -// CHECK11-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 +// CHECK2-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, 
[1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK2-NEXT: 
store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK2-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK2-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK2-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK2-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK2: cond.true10: +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: br label [[COND_END12:%.*]] +// CHECK2: cond.false11: +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END12]] +// CHECK2: cond.end12: +// CHECK2-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK2-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) 
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK2-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK2-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK2: omp.precond.then: +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], 
align 4 +// CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK2-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK2-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK2-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK2-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK2-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK2-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK2-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK2-NEXT: [[ADD11:%.*]] = 
add nsw i32 0, [[MUL10]] +// CHECK2-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: br label [[OMP_PRECOND_END]] +// CHECK2: omp.precond.end: +// CHECK2-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38 -// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK12: .execute: -// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK12-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] -// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK12: .omp.deinit: -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK12-NEXT: br label [[DOTEXIT:%.*]] -// CHECK12: .exit: -// CHECK12-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l37 +// CHECK2-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// 
CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I4:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK12-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* -// CHECK12-NEXT: [[L1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP3]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 
-// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] -// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], 1 -// CHECK12-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP14]], [[ADD]] -// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP18]], i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP20]], i32* [[L_CASTED]], align 4 -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[L_CASTED]], align 4 -// CHECK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP16]] to i8* -// CHECK12-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK12-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK12-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK12-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP19]] to i8* -// CHECK12-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 -// CHECK12-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK12-NEXT: [[TMP29:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* -// CHECK12-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 -// CHECK12-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 -// CHECK12-NEXT: 
[[TMP31:%.*]] = inttoptr i32 [[TMP21]] to i8* -// CHECK12-NEXT: store i8* [[TMP31]], i8** [[TMP30]], align 4 -// CHECK12-NEXT: [[TMP32:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -// CHECK12-NEXT: [[TMP34:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP33]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP34]], i32 5) -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] -// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] -// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP39]], [[TMP40]] -// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]] -// CHECK12-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] -// CHECK12: cond.true11: -// CHECK12-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK12-NEXT: br label [[COND_END13:%.*]] -// CHECK12: cond.false12: -// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END13]] -// CHECK12: cond.end13: -// CHECK12-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP43]], [[COND_TRUE11]] ], [ [[TMP44]], [[COND_FALSE12]] ] -// CHECK12-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP45]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]]) -// CHECK12-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 -// CHECK12-NEXT: br i1 [[TMP49]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: [[TMP50:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP50]], 0 -// CHECK12-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 -// CHECK12-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD17]], i32* 
[[I4]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: [[TMP51:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -// CHECK12-NEXT: br i1 [[TMP52]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK12: .omp.lastprivate.then: -// CHECK12-NEXT: [[TMP53:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP53]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK12: .omp.lastprivate.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK12-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: 
omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK2: cond.true5: +// CHECK2-NEXT: br label [[COND_END7:%.*]] +// CHECK2: cond.false6: +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END7]] +// CHECK2: cond.end7: +// CHECK2-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK2-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = icmp ne 
i32 [[TMP25]], 0 +// CHECK2-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 -// CHECK12-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP6]], 
i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) -// CHECK12-NEXT: br label [[OMP_DISPATCH_COND:%.*]] -// CHECK12: omp.dispatch.cond: -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] -// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] -// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] -// CHECK12: omp.dispatch.body: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK12-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] -// CHECK12-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK12-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK12: omp.body.continue: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 -// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_DISPATCH_INC:%.*]] -// CHECK12: omp.dispatch.inc: -// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// 
CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK12-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: br label [[OMP_DISPATCH_COND]] -// CHECK12: omp.dispatch.end: -// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) -// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 -// CHECK12-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 -// CHECK12-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 -// CHECK12-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 -// CHECK12-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] -// CHECK12-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 -// CHECK12-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] -// CHECK12: .omp.lastprivate.then: -// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] -// CHECK12: .omp.lastprivate.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store [10 x i32]* [[B]], 
[10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK2-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK2-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK2-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l42 +// CHECK2-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// 
CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK2: .execute: +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK2: .omp.deinit: +// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44 -// CHECK12-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK12: .execute: -// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK12-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] -// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK12: .omp.deinit: -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK12-NEXT: br label [[DOTEXIT:%.*]] -// CHECK12: .exit: -// CHECK12-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias 
[[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK2: cond.true: +// CHECK2-NEXT: br label [[COND_END:%.*]] +// CHECK2: cond.false: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END]] +// CHECK2: cond.end: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: store 
i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK2-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK2: cond.true6: +// CHECK2-NEXT: br label [[COND_END8:%.*]] +// CHECK2: cond.false7: +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: br label [[COND_END8]] +// CHECK2: cond.end8: +// CHECK2-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK2-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK2-NEXT: br i1 [[TMP30]], label 
[[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK2-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK2: omp.inner.for.cond: +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label 
[[OMP_INNER_FOR_END:%.*]] +// CHECK2: omp.inner.for.body: +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK2-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK2-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK2-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK2-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK2-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK2-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK2: omp.body.continue: +// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK2: omp.inner.for.inc: +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK2-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK2: omp.inner.for.end: +// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK2: omp.loop.exit: +// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK2-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK2: .omp.final.then: +// CHECK2-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK2-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK2-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK2: .omp.final.done: +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca 
i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i32]* [[TMP0]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[L_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[L1:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK3-NEXT: [[L_ON_STACK:%.*]] = bitcast 
i8* [[L1]] to i32* +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 128) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP18]], i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[L_CASTED]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, 
i32 0 +// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast [1000 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP27]], i8** [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 4 +// CHECK3-NEXT: [[TMP29:%.*]] = inttoptr i32 [[TMP19]] to i8* +// CHECK3-NEXT: store i8* [[TMP29]], i8** [[TMP28]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = bitcast [5 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i32]*, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP32]], i32 5) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP37]], [[TMP38]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]] +// CHECK3-NEXT: br i1 [[CMP10]], label [[COND_TRUE11:%.*]], label [[COND_FALSE12:%.*]] +// CHECK3: cond.true11: +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK3-NEXT: br label [[COND_END13:%.*]] +// CHECK3: cond.false12: +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END13]] +// CHECK3: cond.end13: +// CHECK3-NEXT: [[COND14:%.*]] = phi i32 [ [[TMP41]], [[COND_TRUE11]] ], [ [[TMP42]], [[COND_FALSE12]] ] +// CHECK3-NEXT: store i32 [[COND14]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP43]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK3: 
omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP44:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP45]]) +// CHECK3-NEXT: [[TMP46:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP47:%.*]] = icmp ne i32 [[TMP46]], 0 +// CHECK3-NEXT: br i1 [[TMP47]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP48:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP48]], 0 +// CHECK3-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV16]], 1 +// CHECK3-NEXT: [[ADD17:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD17]], i32* [[I4]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CHECK3-NEXT: br i1 [[TMP50]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP51:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP51]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[L1]]) +// CHECK3-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__2 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* 
[[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] -// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK12-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] -// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* -// CHECK12-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// 
CHECK12-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* -// CHECK12-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 -// CHECK12-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* -// CHECK12-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 -// CHECK12-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK12-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* -// CHECK12-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 -// CHECK12-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK12-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK12-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] -// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK12-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] -// CHECK12-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] -// CHECK12: cond.true10: -// CHECK12-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: br label [[COND_END12:%.*]] -// CHECK12: cond.false11: -// CHECK12-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END12]] -// CHECK12: cond.end12: -// CHECK12-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] -// CHECK12-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) -// CHECK12-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP43:%.*]] = icmp ne 
i32 [[TMP42]], 0 -// CHECK12-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 -// CHECK12-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 -// CHECK12-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i32]* nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca [1000 x i32]*, align 4 +// CHECK3-NEXT: [[L_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* 
[[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 32) +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND:%.*]] +// CHECK3: omp.dispatch.cond: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[TMP9]], [[TMP10]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]] +// CHECK3: omp.dispatch.body: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* [[TMP0]], i32 0, i32 [[TMP19]] +// CHECK3-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: store i32 [[TMP20]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK3-NEXT: store i32 
[[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]] +// CHECK3: omp.dispatch.inc: +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK3-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[OMP_DISPATCH_COND]] +// CHECK3: omp.dispatch.end: +// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP27]]) +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK3-NEXT: br i1 [[TMP29]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB10:%.*]] = sub nsw i32 [[TMP30]], 0 +// CHECK3-NEXT: [[DIV11:%.*]] = sdiv i32 [[SUB10]], 1 +// CHECK3-NEXT: [[MUL12:%.*]] = mul nsw i32 [[DIV11]], 1 +// CHECK3-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]] +// CHECK3-NEXT: store i32 [[ADD13]], i32* [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK3-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// CHECK3: .omp.lastprivate.then: +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP33]], i32* [[L_ADDR]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] +// CHECK3: .omp.lastprivate.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// 
CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 -// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 -// CHECK12-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] -// CHECK12: omp.precond.then: -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 -// CHECK12-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] -// CHECK12-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 -// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] -// CHECK12-NEXT: [[TMP14:%.*]] = load i16, 
i16* [[ARRAYIDX]], align 2 -// CHECK12-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 -// CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 -// CHECK12-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 -// CHECK12-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 -// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK12: omp.body.continue: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] -// CHECK12-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK12-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 -// CHECK12-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 -// CHECK12-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 -// CHECK12-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 -// CHECK12-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] -// CHECK12-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: br label [[OMP_PRECOND_END]] -// CHECK12: omp.precond.end: -// CHECK12-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] 
+// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49 -// CHECK12-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK12: .execute: -// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK12-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] -// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK12: .omp.deinit: -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK12-NEXT: br label [[DOTEXIT:%.*]] -// CHECK12: .exit: -// CHECK12-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__2 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: 
[[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to i8* +// CHECK3-NEXT: store i8* [[TMP19]], i8** [[TMP18]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to i8* +// CHECK3-NEXT: store i8* [[TMP21]], i8** [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = 
getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to i8* +// CHECK3-NEXT: store i8* [[TMP23]], i8** [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP25:%.*]] = bitcast [1000 x i16]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP25]], i8** [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP27]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32, [1000 x i16]*)* @__omp_outlined__3 to i8*), i8* null, i8** [[TMP28]], i32 4) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK3-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]] +// CHECK3-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]] +// CHECK3: cond.true10: +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END12:%.*]] +// CHECK3: cond.false11: +// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END12]] +// CHECK3: cond.end12: +// CHECK3-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ] +// CHECK3-NEXT: store i32 [[COND13]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP39]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]]) +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK3-NEXT: br i1 [[TMP43]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP44:%.*]] = load i32, i32* 
[[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP44]], 0 +// CHECK3-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV15]], 1 +// CHECK3-NEXT: [[ADD16:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD16]], i32* [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__4 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 -// CHECK12-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label 
[[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* -// CHECK12-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* -// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] -// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 -// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] -// CHECK12: cond.true5: -// CHECK12-NEXT: br label [[COND_END7:%.*]] -// CHECK12: cond.false6: -// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END7]] -// CHECK12: cond.end7: -// CHECK12-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] -// CHECK12-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 -// CHECK12-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: store i32 
10, i32* [[I]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: call void 
@__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I3]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[I3]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], [1000 x i16]* [[TMP0]], i32 0, i32 [[TMP13]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1 +// CHECK3-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16 +// CHECK3-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX]], align 2 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]] +// CHECK3-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP18]]) +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK3-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK3-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK3-NEXT: store i32 [[ADD11]], i32* [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__5 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 -// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -// CHECK12-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 -// CHECK12-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 -// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK12: omp.body.continue: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], 
align 4 -// CHECK12-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] -// CHECK12-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -// CHECK12-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l37 +// CHECK3-SAME: ([10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__4 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// 
CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10 +// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x i32]*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP15]], i32 3) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK3-NEXT: store i32 [[ADD2]], i32* 
[[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9 +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]] +// CHECK3: cond.true5: +// CHECK3-NEXT: br label [[COND_END7:%.*]] +// CHECK3: cond.false6: +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END7]] +// CHECK3: cond.end7: +// CHECK3-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ] +// CHECK3-NEXT: store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +// CHECK3-NEXT: br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* 
[[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 [[TMP9]] +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK3-NEXT: store i32 [[ADD1]], i32* [[ARRAYIDX]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CHECK3-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK3-NEXT: br i1 [[TMP14]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l54 -// CHECK12-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], 
align 4 -// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) -// CHECK12-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK12: .execute: -// CHECK12-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK12-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] -// CHECK12-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK12: .omp.deinit: -// CHECK12-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK12-NEXT: br label [[DOTEXIT:%.*]] -// CHECK12: .exit: -// CHECK12-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l42 +// CHECK3-SAME: ([10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 0) +// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK3: .execute: +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP3]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__6 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x 
i32]]*, align 4 -// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK12-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) -// CHECK12-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 -// CHECK12-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK12: cond.true: -// CHECK12-NEXT: br label [[COND_END:%.*]] -// CHECK12: cond.false: -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END]] -// CHECK12: cond.end: -// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] -// CHECK12-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 -// CHECK12-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK12-NEXT: [[TMP12:%.*]] = inttoptr i32 
[[TMP7]] to i8* -// CHECK12-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK12-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* -// CHECK12-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 -// CHECK12-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* -// CHECK12-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 -// CHECK12-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 -// CHECK12-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* -// CHECK12-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 -// CHECK12-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK12-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] -// CHECK12-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK12-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 -// CHECK12-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] -// CHECK12: cond.true6: -// CHECK12-NEXT: br label [[COND_END8:%.*]] -// CHECK12: cond.false7: -// CHECK12-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: br label [[COND_END8]] -// CHECK12: cond.end8: -// CHECK12-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] -// CHECK12-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 -// CHECK12-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]]) -// CHECK12-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -// CHECK12-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK12-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK12-NEXT: br label 
[[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__6 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99 +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100 +// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, 
i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP9]], i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[F_CASTED]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8* +// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8* +// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP16:%.*]] = bitcast [10 x [10 x i32]]* [[TMP0]] to i8* +// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 3 +// CHECK3-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP10]] to i8* +// CHECK3-NEXT: store i8* [[TMP18]], i8** [[TMP17]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, [10 x [10 x i32]]*, i32)* @__omp_outlined__7 to i8*), i8* null, i8** [[TMP19]], i32 4) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK3-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK3-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99 +// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]] +// CHECK3: cond.true6: +// CHECK3-NEXT: br label [[COND_END8:%.*]] +// CHECK3: cond.false7: +// CHECK3-NEXT: [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END8]] +// CHECK3: cond.end8: +// CHECK3-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ] +// CHECK3-NEXT: store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* 
@[[GLOB1]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +// CHECK3-NEXT: br i1 [[TMP30]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK3-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: ret void // // -// CHECK12-LABEL: define {{[^@]+}}@__omp_outlined__7 -// CHECK12-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { -// CHECK12-NEXT: entry: -// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 -// CHECK12-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[K:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: [[J:%.*]] = alloca i32, align 4 -// CHECK12-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 -// CHECK12-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 -// CHECK12-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -// CHECK12-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) -// CHECK12-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 -// CHECK12-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// CHECK12: omp.inner.for.cond: -// CHECK12-NEXT: [[TMP6:%.*]] = load i32, i32* 
[[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 -// CHECK12-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] -// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// CHECK12: omp.inner.for.body: -// CHECK12-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 -// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 -// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK12-NEXT: store i32 [[ADD]], i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 -// CHECK12-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 -// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] -// CHECK12-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 -// CHECK12-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] -// CHECK12-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 -// CHECK12-NEXT: store i32 10, i32* [[K]], align 4 -// CHECK12-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 -// CHECK12-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 -// CHECK12-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 -// CHECK12-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] -// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] -// CHECK12-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 -// CHECK12-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] -// CHECK12-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 -// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] -// CHECK12-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 -// CHECK12-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] -// CHECK12-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 -// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] -// CHECK12: omp.body.continue: -// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// CHECK12: omp.inner.for.inc: -// CHECK12-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 -// CHECK12-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] -// CHECK12-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 -// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] -// CHECK12: omp.inner.for.end: -// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]] -// CHECK12: omp.loop.exit: -// CHECK12-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) -// CHECK12-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK12-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 -// CHECK12-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] -// CHECK12: .omp.final.then: -// CHECK12-NEXT: store i32 10, i32* [[I]], align 4 -// CHECK12-NEXT: store i32 10, i32* [[J]], align 4 -// CHECK12-NEXT: br label [[DOTOMP_FINAL_DONE]] -// CHECK12: .omp.final.done: -// CHECK12-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__7 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], [10 x [10 x i32]]* nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] { +// 
CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [10 x [10 x i32]]*, align 4 +// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 99, i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10 +// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10 +// 
CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]] +// CHECK3-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1 +// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]] +// CHECK3-NEXT: store i32 [[ADD5]], i32* [[J]], align 4 +// CHECK3-NEXT: store i32 10, i32* [[K]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[J]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[F_ADDR]], align 4 +// CHECK3-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]] +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[K]], align 4 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]] +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* [[TMP0]], i32 0, i32 [[TMP15]] +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, i32* [[J]], align 4 +// CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX]], i32 0, i32 [[TMP16]] +// CHECK3-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK3-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP4]]) +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: store i32 10, i32* [[I]], align 4 +// CHECK3-NEXT: store i32 10, i32* [[J]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp @@ -2,10 +2,8 @@ // Test target codegen - host bc file has to be created first. 
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3 -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK4 +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -42,11 +40,9 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK5 -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK6 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK3 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK7 -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK8 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK4 // expected-no-diagnostics #ifdef CK2 @@ -80,828 +76,6 @@ #endif // CK2 #endif -// 
CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker -// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 -// CHECK1-SAME: (i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 
[[NVPTX_NUM_THREADS2]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK1-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 -// CHECK1-NEXT: store i32 [[TMP10]], i32* [[ARGC7]], align 4 -// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: store i32 0, i32* [[TMP0]], align 4 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17 -// CHECK1-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// 
CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: -// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size2", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0* -// CHECK1-NEXT: [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0 -// CHECK1-NEXT: store i8** [[TMP10]], i8*** [[ARGC7]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: store i8** null, i8*** [[TMP0]], align 8 -// CHECK1-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker -// CHECK2-SAME: () 
#[[ATTR0:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 -// CHECK2-SAME: (i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 
[[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8 -// CHECK2-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: store i32 [[TMP7]], i32* [[ARGC7]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: store i32 0, i32* [[TMP0]], align 4 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker -// CHECK2-SAME: () #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label 
[[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17 -// CHECK2-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK2-NEXT: [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: [[ARGC7:%.*]] = getelementptr 
inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: store i8** [[TMP7]], i8*** [[ARGC7]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8 -// CHECK2-NEXT: store i8** null, i8*** [[TMP0]], align 8 -// CHECK2-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker -// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK3: .await.work: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK3: .select.workers: -// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK3: .execute.parallel: -// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK3: .terminate.parallel: -// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK3: .barrier.parallel: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label 
[[DOTAWAIT_WORK]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 -// CHECK3-SAME: (i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK3: .worker: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]] -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .mastercheck: -// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK3: .master: -// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 -// CHECK3-NEXT: store i32 [[TMP10]], i32* [[ARGC7]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = call i32 
@__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) -// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK3: .termination.notifier: -// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTEXIT]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[TMP0]], align 4 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK3: .await.work: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK3: .select.workers: -// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK3: .execute.parallel: -// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK3: .terminate.parallel: -// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK3: .barrier.parallel: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17 -// CHECK3-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK3: .worker: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]] -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .mastercheck: -// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK3: .master: -// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4 -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0 -// CHECK3-NEXT: store i8** [[TMP10]], i8*** [[ARGC7]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: store i32 
[[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) -// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK3: .termination.notifier: -// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTEXIT]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4 -// CHECK3-NEXT: store i8** null, i8*** [[TMP0]], align 4 -// CHECK3-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker -// CHECK4-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK4: .await.work: -// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK4: .select.workers: -// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK4: .execute.parallel: -// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK4: .terminate.parallel: -// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK4: .barrier.parallel: -// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25 -// CHECK4-SAME: (i32 
[[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK4: .worker: -// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_worker() #[[ATTR3:[0-9]+]] -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .mastercheck: -// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK4: .master: -// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK4-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 -// CHECK4-NEXT: store i32 [[TMP7]], i32* [[ARGC7]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK4-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] -// CHECK4-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK4: .termination.notifier: -// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK4-NEXT: br label [[DOTEXIT]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK4-NEXT: entry: -// 
CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store i32 0, i32* [[TMP0]], align 4 -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker -// CHECK4-SAME: () #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK4: .await.work: -// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK4: .select.workers: -// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK4: .execute.parallel: -// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK4: .terminate.parallel: -// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK4: .barrier.parallel: -// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17 -// CHECK4-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4 -// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK4-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK4: .worker: -// CHECK4-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l17_worker() #[[ATTR3]] -// CHECK4-NEXT: br label [[DOTEXIT:%.*]] -// CHECK4: .mastercheck: -// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK4: .master: -// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK4-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK4-NEXT: [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK4-NEXT: store i8** [[TMP7]], i8*** [[ARGC7]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK4-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] -// CHECK4-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK4: .termination.notifier: -// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK4-NEXT: br label [[DOTEXIT]] -// CHECK4: .exit: -// CHECK4-NEXT: ret void -// -// -// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4 -// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK4-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: store i8** null, i8*** [[TMP0]], align 4 -// CHECK4-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -935,8 +109,6 @@ // CHECK5-NEXT: br label [[DOTAWAIT_WORK]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 // CHECK5-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { // CHECK5-NEXT: entry: @@ -998,8 +170,6 @@ // CHECK5-NEXT: br label [[DOTEXIT]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: @@ -1012,8 +182,6 @@ // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 // CHECK5-NEXT: store i32 0, i32* [[TMP0]], align 4 // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker // CHECK5-SAME: () #[[ATTR0]] { // CHECK5-NEXT: entry: @@ -1047,8 +215,6 @@ // CHECK5-NEXT: br label [[DOTAWAIT_WORK]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57 // CHECK5-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: @@ -1109,8 +275,6 @@ // CHECK5-NEXT: br label [[DOTEXIT]] // CHECK5: .exit: // CHECK5-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: @@ -1123,643 +287,1411 @@ // CHECK5-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8 // CHECK5-NEXT: store i8** null, i8*** [[TMP0]], align 8 // CHECK5-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker +// CHECK6-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// 
CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 +// CHECK6-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK6-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// CHECK6-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32* +// CHECK6-NEXT: [[CONV2:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1 +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1) +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV2]], align 8 +// CHECK6-NEXT: [[ARGC9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: store i32 [[TMP7]], i32* [[ARGC9]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC9]]) #[[ATTR3]] +// CHECK6-NEXT: call void 
@__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8 +// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8 +// CHECK6-NEXT: store i32 0, i32* [[TMP0]], align 4 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker +// CHECK6-SAME: () #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57 +// CHECK6-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 8 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: 
[[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 +// CHECK6-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 +// CHECK6-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* +// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32* +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK6-NEXT: [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8 +// CHECK6-NEXT: [[ARGC8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: store i8** [[TMP7]], i8*** [[ARGC8]], align 8 +// CHECK6-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC8]]) #[[ATTR3]] +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 
dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 8 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK6-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8 +// CHECK6-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8 +// CHECK6-NEXT: store i8** null, i8*** [[TMP0]], align 8 +// CHECK6-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker +// CHECK7-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK7-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK7-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK7-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK7-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK7: .await.work: +// CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK7-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK7-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK7-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK7-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK7-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK7-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK7: .select.workers: +// CHECK7-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK7-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK7-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK7: .execute.parallel: +// CHECK7-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK7-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK7-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK7-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK7: .terminate.parallel: +// CHECK7-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK7-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK7: .barrier.parallel: +// CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK7-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 +// CHECK7-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK7-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK7-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK7-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK7-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK7: .worker: +// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]] +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .mastercheck: +// CHECK7-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK7-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK7-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK7-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK7-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK7-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK7-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK7-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK7: .master: +// CHECK7-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK7-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK7-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK7-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK7-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4 +// CHECK7-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK7-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 +// CHECK7-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* +// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK7-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 +// CHECK7-NEXT: store i32 [[TMP10]], i32* [[ARGC7]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] +// CHECK7-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 +// CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) +// CHECK7-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK7: .termination.notifier: +// CHECK7-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK7-NEXT: br label [[DOTEXIT]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define 
{{[^@]+}}@__omp_outlined__ +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK7-NEXT: store i32 0, i32* [[TMP0]], align 4 +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker +// CHECK7-SAME: () #[[ATTR0]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK7-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK7-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK7-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK7-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK7: .await.work: +// CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK7-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK7-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK7-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK7-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK7-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK7-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK7: .select.workers: +// CHECK7-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK7-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK7-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK7: .execute.parallel: +// CHECK7-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK7-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK7-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK7-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK7: .terminate.parallel: +// CHECK7-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK7-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK7: .barrier.parallel: +// CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK7-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57 +// CHECK7-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4 +// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK7-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK7-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 +// CHECK7-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK7-NEXT: 
[[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK7-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK7-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK7-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK7: .worker: +// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]] +// CHECK7-NEXT: br label [[DOTEXIT:%.*]] +// CHECK7: .mastercheck: +// CHECK7-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK7-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK7-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK7-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK7-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK7-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK7-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK7-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK7: .master: +// CHECK7-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK7-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK7-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK7-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK7-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK7-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 +// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4 +// CHECK7-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) +// CHECK7-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 +// CHECK7-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 +// CHECK7-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0* +// CHECK7-NEXT: [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4 +// CHECK7-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0 +// CHECK7-NEXT: store i8** [[TMP10]], i8*** [[ARGC7]], align 4 +// CHECK7-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK7-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] +// CHECK7-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 +// CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) +// CHECK7-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK7: .termination.notifier: +// CHECK7-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK7-NEXT: call void 
@__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK7-NEXT: br label [[DOTEXIT]] +// CHECK7: .exit: +// CHECK7-NEXT: ret void +// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3 +// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK7-NEXT: entry: +// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4 +// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK7-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4 +// CHECK7-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4 +// CHECK7-NEXT: store i8** null, i8*** [[TMP0]], align 4 +// CHECK7-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker +// CHECK8-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK8-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK8-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK8-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK8-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK8: .await.work: +// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK8-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK8-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK8-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK8-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK8-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK8-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK8: .select.workers: +// CHECK8-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK8-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK8-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK8: .execute.parallel: +// CHECK8-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK8-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK8-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK8: .terminate.parallel: +// CHECK8-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK8-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK8: .barrier.parallel: +// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK8-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 +// CHECK8-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// 
CHECK8-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK8-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK8-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK8-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK8-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK8: .worker: +// CHECK8-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]] +// CHECK8-NEXT: br label [[DOTEXIT:%.*]] +// CHECK8: .mastercheck: +// CHECK8-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK8-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK8-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK8-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK8-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK8-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK8-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK8-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK8: .master: +// CHECK8-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK8-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK8-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK8-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK8-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) +// CHECK8-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK8-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK8-NEXT: store i32 [[TMP7]], i32* [[ARGC7]], align 4 +// CHECK8-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK8-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK8-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] +// CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK8-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK8: .termination.notifier: +// CHECK8-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK8-NEXT: br label [[DOTEXIT]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* 
[[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK8-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK8-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK8-NEXT: store i32 0, i32* [[TMP0]], align 4 +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker +// CHECK8-SAME: () #[[ATTR0]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK8-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK8-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK8-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK8-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK8: .await.work: +// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK8-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK8-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK8-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK8-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK8-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK8-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK8: .select.workers: +// CHECK8-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK8-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK8-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK8: .execute.parallel: +// CHECK8-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK8-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK8-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK8: .terminate.parallel: +// CHECK8-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK8-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK8: .barrier.parallel: +// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK8-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK8: .exit: +// CHECK8-NEXT: ret void +// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57 +// CHECK8-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK8-NEXT: entry: +// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4 +// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK8-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK8-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 +// CHECK8-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK8-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK8-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK8-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK8-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK8: .worker: +// CHECK8-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]]
+// CHECK8-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK8: .mastercheck:
+// CHECK8-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK8-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK8-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK8-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK8-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK8-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK8-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK8: .master:
+// CHECK8-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK8-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK8-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK8-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK8-NEXT: call void @__kmpc_data_sharing_init_stack()
+// CHECK8-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1)
+// CHECK8-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK8-NEXT: [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
+// CHECK8-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK8-NEXT: store i8** [[TMP7]], i8*** [[ARGC7]], align 4
+// CHECK8-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK8-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK8-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]]
+// CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
+// CHECK8-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK8: .termination.notifier:
+// CHECK8-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK8-NEXT: br label [[DOTEXIT]]
+// CHECK8: .exit:
+// CHECK8-NEXT: ret void
+// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
+// CHECK8-NEXT: entry:
+// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4
+// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK8-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4
+// CHECK8-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4
+// CHECK8-NEXT: store i8** null, i8*** [[TMP0]], align 4
+// CHECK8-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker
-// CHECK6-SAME: () #[[ATTR0:[0-9]+]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
-// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 8
-// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK6: .await.work:
-// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
-// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK6: .select.workers:
-// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK6: .execute.parallel:
-// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
-// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK6: .terminate.parallel:
-// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK6: .barrier.parallel:
-// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK6-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK6: .exit:
-// CHECK6-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23
+// CHECK1-SAME: (i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK1: .worker:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker() #[[ATTR3:[0-9]+]]
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .mastercheck:
+// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK1: .master:
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8
+// CHECK1-NEXT: [[ARGC7:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK1-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC7]] to i32*
+// CHECK1-NEXT: store i32 [[TMP5]], i32* [[ARGC_ON_STACK]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR3]]
+// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[ARGC7]])
+// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK1: .termination.notifier:
+// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTEXIT]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
-// CHECK6-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
-// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
-// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8
-// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK6-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
-// CHECK6-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
-// CHECK6-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
-// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
-// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
-// CHECK6-NEXT: [[CONV2:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
-// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
-// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
-// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
-// CHECK6: .worker:
-// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]]
-// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK6: .mastercheck:
-// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
-// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
-// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
-// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
-// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
-// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
-// CHECK6: .master:
-// CHECK6-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK6-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK6-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
-// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
-// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack()
-// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 1)
-// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
-// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV2]], align 8
-// CHECK6-NEXT: [[ARGC9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
-// CHECK6-NEXT: store i32 [[TMP7]], i32* [[ARGC9]], align 4
-// CHECK6-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK6-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC9]]) #[[ATTR3]]
-// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
-// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
-// CHECK6: .termination.notifier:
-// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK6-NEXT: br label [[DOTEXIT]]
-// CHECK6: .exit:
-// CHECK6-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[TMP0]], align 4
+// CHECK1-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
-// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8
-// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK6-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
-// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
-// CHECK6-NEXT: store i32 0, i32* [[TMP0]], align 4
-// CHECK6-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l15_worker
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker
-// CHECK6-SAME: () #[[ATTR0]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
-// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 8
-// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK6: .await.work:
-// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
-// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK6: .select.workers:
-// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK6: .execute.parallel:
-// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
-// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK6: .terminate.parallel:
-// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK6: .barrier.parallel:
-// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK6-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK6: .exit:
-// CHECK6-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l15
+// CHECK1-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK1: .worker:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l15_worker() #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .mastercheck:
+// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK1: .master:
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
+// CHECK1-NEXT: [[ARGC7:%.*]] = call i8* @__kmpc_alloc_shared(i64 8)
+// CHECK1-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC7]] to i8***
+// CHECK1-NEXT: store i8** [[TMP5]], i8*** [[ARGC_ON_STACK]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR3]]
+// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[ARGC7]])
+// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK1: .termination.notifier:
+// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTEXIT]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57
-// CHECK6-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
-// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
-// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 8
-// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK6-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
-// CHECK6-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
-// CHECK6-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8
-// CHECK6-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
-// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
-// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
-// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
-// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
-// CHECK6: .worker:
-// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]]
-// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK6: .mastercheck:
-// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK6-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK6-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
-// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
-// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
-// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
-// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
-// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
-// CHECK6: .master:
-// CHECK6-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK6-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK6-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
-// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
-// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack()
-// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1)
-// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
-// CHECK6-NEXT: [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8
-// CHECK6-NEXT: [[ARGC8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
-// CHECK6-NEXT: store i8** [[TMP7]], i8*** [[ARGC8]], align 8
-// CHECK6-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK6-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC8]]) #[[ATTR3]]
-// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
-// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
-// CHECK6: .termination.notifier:
-// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK6-NEXT: br label [[DOTEXIT]]
-// CHECK6: .exit:
-// CHECK6-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8
+// CHECK1-NEXT: store i8** null, i8*** [[TMP0]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker
+// CHECK2-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23
+// CHECK2-SAME: (i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK2: .worker:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker() #[[ATTR3:[0-9]+]]
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .mastercheck:
+// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK2: .master:
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: [[ARGC7:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
+// CHECK2-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC7]] to i32*
+// CHECK2-NEXT: store i32 [[TMP5]], i32* [[ARGC_ON_STACK]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR3]]
+// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[ARGC7]])
+// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK2: .termination.notifier:
+// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTEXIT]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1
-// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK6-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 8
-// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK6-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8
-// CHECK6-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8
-// CHECK6-NEXT: store i8** null, i8*** [[TMP0]], align 8
-// CHECK6-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[TMP0]], align 4
+// CHECK2-NEXT: ret void
//
//
-// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker
-// CHECK7-SAME: () #[[ATTR0:[0-9]+]] {
-// CHECK7-NEXT: entry:
-// CHECK7-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
-// CHECK7-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK7-NEXT: store i8* null, i8** [[WORK_FN]], align 4
-// CHECK7-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK7-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK7: .await.work:
-// CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK7-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK7-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK7-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK7-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK7-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK7-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK7: .select.workers:
-// CHECK7-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK7-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK7-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK7: .execute.parallel:
-// CHECK7-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK7-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK7-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
-// CHECK7-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK7: .terminate.parallel:
-// CHECK7-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK7-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK7: .barrier.parallel:
-// CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK7-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK7: .exit:
-// CHECK7-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l15_worker
+// CHECK2-SAME: () #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK2: .await.work:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK2: .select.workers:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK2: .execute.parallel:
+// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK2: .terminate.parallel:
+// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK2: .barrier.parallel:
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
//
//
-// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
-// CHECK7-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
-// CHECK7-NEXT: entry:
-// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
-// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
-// CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
-// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK7-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK7-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
-// CHECK7-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4
-// CHECK7-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK7-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK7-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
-// CHECK7-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
-// CHECK7-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
-// CHECK7: .worker:
-// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]]
-// CHECK7-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK7: .mastercheck:
-// CHECK7-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK7-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK7-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK7-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
-// CHECK7-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
-// CHECK7-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
-// CHECK7-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
-// CHECK7-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
-// CHECK7-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
-// CHECK7: .master:
-// CHECK7-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK7-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK7-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
-// CHECK7-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK7-NEXT: call void @__kmpc_data_sharing_init_stack()
-// CHECK7-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
-// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
-// CHECK7-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
-// CHECK7-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
-// CHECK7-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
-// CHECK7-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty*
-// CHECK7-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4
-// CHECK7-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0
-// CHECK7-NEXT: store i32 [[TMP10]], i32* [[ARGC7]], align 4
-// CHECK7-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK7-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]]
-// CHECK7-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
-// CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
-// CHECK7-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
-// CHECK7: .termination.notifier:
-// CHECK7-NEXT: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK7-NEXT: br label [[DOTEXIT]]
-// CHECK7: .exit:
-// CHECK7-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l15
+// CHECK2-SAME: (i8** [[ARGC:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK2: .worker:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l15_worker() #[[ATTR3]]
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .mastercheck:
+// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
+// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
+// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
+// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK2: .master:
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
+// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK2-NEXT: [[TMP5:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: [[ARGC7:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
+// CHECK2-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC7]] to i8***
+// CHECK2-NEXT: store i8** [[TMP5]], i8*** [[ARGC_ON_STACK]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR3]]
+// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[ARGC7]])
+// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK2: .termination.notifier:
+// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK2-NEXT: br label [[DOTEXIT]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
//
//
-// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__
-// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
-// CHECK7-NEXT: entry:
-// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4
-// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK7-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4
-// CHECK7-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4
-// CHECK7-NEXT: store i32 0, i32* [[TMP0]], align 4
-// CHECK7-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i8** null, i8*** [[TMP0]], align 4
+// CHECK2-NEXT: ret void
//
//
-// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker
-// CHECK7-SAME: () #[[ATTR0]] {
-// CHECK7-NEXT: entry:
-// CHECK7-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
-// CHECK7-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK7-NEXT: store i8* null, i8** [[WORK_FN]], align 4
-// CHECK7-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK7-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK7: .await.work:
-// CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK7-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK7-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK7-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK7-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK7-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK7-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK7: .select.workers:
-// CHECK7-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK7-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK7-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK7: .execute.parallel:
-// CHECK7-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK7-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK7-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
-// CHECK7-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK7: .terminate.parallel:
-// CHECK7-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK7-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK7: .barrier.parallel:
-// CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK7-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK7: .exit:
-// CHECK7-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64_worker
+// CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK3: .await.work:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK3: .select.workers:
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK3: .execute.parallel:
+// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK3: .terminate.parallel:
+// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK3: .barrier.parallel:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64
+// CHECK3-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i64 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK3-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
+// CHECK3-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
+// CHECK3-NEXT: [[CONV2:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32*
+// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK3: .worker:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64_worker() #[[ATTR3:[0-9]+]]
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .mastercheck:
+// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
+// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
+// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
+// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK3: .master:
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
+// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV2]], align 8
+// CHECK3-NEXT: [[ARGC9:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK3-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC9]] to i32*
+// CHECK3-NEXT: store i32 [[TMP5]], i32* [[ARGC_ON_STACK]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR3]]
+// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[ARGC9]])
+// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK3: .termination.notifier:
+// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTEXIT]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
//
//
-// CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57
-// CHECK7-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] {
-// CHECK7-NEXT: entry:
-// CHECK7-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
-// CHECK7-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
-// CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4
-// CHECK7-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK7-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK7-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK7-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK7-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
-// CHECK7-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4
-// CHECK7-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK7-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK7-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK7-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
-// CHECK7-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
-// CHECK7-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
-// CHECK7: .worker:
-// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]]
-// CHECK7-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK7: .mastercheck:
-// CHECK7-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK7-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK7-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK7-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
-// CHECK7-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
-// CHECK7-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
-// CHECK7-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
-// CHECK7-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
-// CHECK7-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
-// CHECK7: .master:
-// CHECK7-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK7-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK7-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
-// CHECK7-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK7-NEXT: call void @__kmpc_data_sharing_init_stack()
-// CHECK7-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
-// CHECK7-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4
-// CHECK7-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
-// CHECK7-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
-// CHECK7-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
-// CHECK7-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.0*
-// CHECK7-NEXT: [[TMP10:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4
-// CHECK7-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP9]], i32 0, i32 0
-// CHECK7-NEXT: store i8** [[TMP10]], i8*** [[ARGC7]], align 4
-// CHECK7-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK7-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK7-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]]
-// CHECK7-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
-// CHECK7-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]])
-// CHECK7-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
-// CHECK7: .termination.notifier:
-// CHECK7-NEXT: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK7-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK7-NEXT: br label [[DOTEXIT]]
-// CHECK7: .exit:
-// CHECK7-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 8
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK3-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 8
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 8
+// CHECK3-NEXT: store i32 0, i32* [[TMP0]], align 4
+// CHECK3-NEXT: ret void
//
//
-// CHECK7-LABEL: define {{[^@]+}}@__omp_outlined__3
-// CHECK7-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] {
-// CHECK7-NEXT: entry:
-// CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK7-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4
-// CHECK7-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK7-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK7-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4
-// CHECK7-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4
-// CHECK7-NEXT: store i8** null, i8*** [[TMP0]], align 4
-// CHECK7-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l53_worker
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK3: .await.work:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK3: .select.workers:
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK3: .execute.parallel:
+// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK3: .terminate.parallel:
+// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK3: .barrier.parallel:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
//
//
-// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker
-// CHECK8-SAME: () #[[ATTR0:[0-9]+]] {
-// CHECK8-NEXT: entry:
-// CHECK8-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
-// CHECK8-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK8-NEXT: store i8* null, i8** [[WORK_FN]], align 4
-// CHECK8-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK8-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK8: .await.work:
-// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK8-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK8-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK8-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK8-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK8-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK8-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK8: .select.workers:
-// CHECK8-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK8-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK8-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK8: .execute.parallel:
-// CHECK8-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK8-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
-// CHECK8-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK8: .terminate.parallel:
-// CHECK8-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK8-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK8: .barrier.parallel:
-// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK8-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK8: .exit:
-// CHECK8-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l53
+// CHECK3-SAME: (i64 [[A:%.*]], i64 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 8
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK3-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
+// CHECK3-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32*
+// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK3: .worker:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l53_worker() #[[ATTR3]]
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .mastercheck:
+// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
+// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
+// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
+// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK3: .master:
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
+//
CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK3-NEXT: [[TMP5:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 8 +// CHECK3-NEXT: [[ARGC8:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK3-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC8]] to i8*** +// CHECK3-NEXT: store i8** [[TMP5]], i8*** [[ARGC_ON_STACK]], align 8 +// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR3]] +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[ARGC8]]) +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 -// CHECK8-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK8-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK8-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 -// CHECK8-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK8-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK8-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK8-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK8-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK8: .worker: -// CHECK8-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68_worker() #[[ATTR3:[0-9]+]] -// CHECK8-NEXT: br label [[DOTEXIT:%.*]] -// CHECK8: .mastercheck: -// CHECK8-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK8-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK8-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK8-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK8-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK8-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK8-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK8-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK8: .master: -// CHECK8-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK8-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK8-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK8-NEXT: call void 
@__kmpc_data_sharing_init_stack() -// CHECK8-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK8-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* -// CHECK8-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 -// CHECK8-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 -// CHECK8-NEXT: store i32 [[TMP7]], i32* [[ARGC7]], align 4 -// CHECK8-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK8-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK8-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC7]]) #[[ATTR3]] -// CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK8-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK8: .termination.notifier: -// CHECK8-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK8-NEXT: br label [[DOTEXIT]] -// CHECK8: .exit: -// CHECK8-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 8 dereferenceable(8) [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK3-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 8 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK3-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 8 +// CHECK3-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 8 +// CHECK3-NEXT: store i8** null, i8*** [[TMP0]], align 8 +// CHECK3-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64_worker +// CHECK4-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK4: .await.work: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK4: .select.workers: +// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK4: .execute.parallel: +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK4-NEXT: br label 
[[DOTTERMINATE_PARALLEL:%.*]] +// CHECK4: .terminate.parallel: +// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK4: .barrier.parallel: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void // // -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK8-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 -// CHECK8-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 -// CHECK8-NEXT: store i32 0, i32* [[TMP0]], align 4 -// CHECK8-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64 +// CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[ARGC:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK4: .worker: +// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64_worker() #[[ATTR3:[0-9]+]] +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .mastercheck: +// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK4: .master: +// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 
[[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARGC_ADDR]], align 4 +// CHECK4-NEXT: [[ARGC7:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK4-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC7]] to i32* +// CHECK4-NEXT: store i32 [[TMP5]], i32* [[ARGC_ON_STACK]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR3]] +// CHECK4-NEXT: call void @__kmpc_free_shared(i8* [[ARGC7]]) +// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK4: .termination.notifier: +// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTEXIT]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void // // -// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker -// CHECK8-SAME: () #[[ATTR0]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK8-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK8-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK8-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK8-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK8: .await.work: -// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK8-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK8-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK8-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK8-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK8-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK8-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK8: .select.workers: -// CHECK8-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK8-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK8-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK8: .execute.parallel: -// CHECK8-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK8-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK8-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK8: .terminate.parallel: -// CHECK8-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK8-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK8: .barrier.parallel: -// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK8-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK8: .exit: -// CHECK8-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* 
[[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[ARGC]], i32** [[ARGC_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[ARGC_ADDR]], align 4 +// CHECK4-NEXT: store i32 0, i32* [[TMP0]], align 4 +// CHECK4-NEXT: ret void // // -// CHECK8-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57 -// CHECK8-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4 -// CHECK8-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK8-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK8-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK8-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 -// CHECK8-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 -// CHECK8-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK8-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK8-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK8-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK8-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK8: .worker: -// CHECK8-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l57_worker() #[[ATTR3]] -// CHECK8-NEXT: br label [[DOTEXIT:%.*]] -// CHECK8: .mastercheck: -// CHECK8-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK8-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK8-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK8-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK8-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK8-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK8-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK8-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK8: .master: -// CHECK8-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK8-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK8-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK8-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK8-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK8-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 4, i16 1) -// CHECK8-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK8-NEXT: [[TMP7:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4 -// CHECK8-NEXT: [[ARGC7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK8-NEXT: store i8** [[TMP7]], i8*** [[ARGC7]], align 4 -// CHECK8-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK8-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK8-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* 
[[DOTZERO_ADDR]], i8*** [[ARGC7]]) #[[ATTR3]] -// CHECK8-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK8-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK8: .termination.notifier: -// CHECK8-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK8-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK8-NEXT: br label [[DOTEXIT]] -// CHECK8: .exit: -// CHECK8-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l53_worker +// CHECK4-SAME: () #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK4-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK4: .await.work: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK4-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK4-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK4-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK4-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK4: .select.workers: +// CHECK4-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK4-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK4-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK4: .execute.parallel: +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK4-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK4-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK4: .terminate.parallel: +// CHECK4-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK4-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK4: .barrier.parallel: +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void // // -// CHECK8-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK8-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK8-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK8-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4 -// CHECK8-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK8-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK8-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4 -// CHECK8-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4 -// CHECK8-NEXT: store i8** null, i8*** [[TMP0]], align 4 -// CHECK8-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l53 +// CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i8** [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i8**, align 4 +// 
CHECK4-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK4-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK4-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK4-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK4-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK4: .worker: +// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIPPcEiT__l53_worker() #[[ATTR3]] +// CHECK4-NEXT: br label [[DOTEXIT:%.*]] +// CHECK4: .mastercheck: +// CHECK4-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK4-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK4-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK4-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK4-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK4: .master: +// CHECK4-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK4-NEXT: [[TMP5:%.*]] = load i8**, i8*** [[ARGC_ADDR]], align 4 +// CHECK4-NEXT: [[ARGC7:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK4-NEXT: [[ARGC_ON_STACK:%.*]] = bitcast i8* [[ARGC7]] to i8*** +// CHECK4-NEXT: store i8** [[TMP5]], i8*** [[ARGC_ON_STACK]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK4-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR3]] +// CHECK4-NEXT: call void @__kmpc_free_shared(i8* [[ARGC7]]) +// CHECK4-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK4: .termination.notifier: +// CHECK4-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK4-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK4-NEXT: br label [[DOTEXIT]] +// CHECK4: .exit: +// CHECK4-NEXT: ret void +// +// +// CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8*** nonnull align 4 dereferenceable(4) [[ARGC:%.*]]) #[[ATTR1]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK4-NEXT: [[ARGC_ADDR:%.*]] = alloca i8***, align 4 +// CHECK4-NEXT: store i32* 
[[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK4-NEXT: store i8*** [[ARGC]], i8**** [[ARGC_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load i8***, i8**** [[ARGC_ADDR]], align 4 +// CHECK4-NEXT: store i8** null, i8*** [[TMP0]], align 4 +// CHECK4-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -2,12 +2,9 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK4 -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK5 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK6 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -53,4354 +50,6 @@ } #endif -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker -// CHECK1-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], 
align 8 -// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK1: .await.work: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK1: .select.workers: -// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK1: .terminate.parallel: -// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK1: .barrier.parallel: -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 -// CHECK1-SAME: (i64 [[E:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca i64, align 8 -// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: store i64 [[E]], i64* [[E_ADDR]], align 8 -// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double* -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] -// CHECK1-NEXT: br label [[DOTEXIT:%.*]] -// CHECK1: .mastercheck: -// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK1: .master: 
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size", align 8 -// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty* -// CHECK1-NEXT: [[TMP10:%.*]] = load double, double* [[CONV]], align 8 -// CHECK1-NEXT: [[E7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP9]], i32 0, i32 0 -// CHECK1-NEXT: store double [[TMP10]], double* [[E7]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: store i32 [[TMP11]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 -// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP12]]) -// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK1: .termination.notifier: -// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK1-NEXT: br label [[DOTEXIT]] -// CHECK1: .exit: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK1-NEXT: store double* [[E]], double** [[E_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8 -// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.0* -// CHECK1-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], 
%struct._globalized_locals_ty.0* [[TMP3]], i32 0, i32 0 -// CHECK1-NEXT: store double 0.000000e+00, double* [[E1]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load double, double* [[E1]], align 8 -// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP4]], 5.000000e+00 -// CHECK1-NEXT: store double [[ADD]], double* [[E1]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[E1]] to i8* -// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) -// CHECK1-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1 -// CHECK1-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK1: .omp.reduction.then: -// CHECK1-NEXT: [[TMP13:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK1-NEXT: [[TMP14:%.*]] = load double, double* [[E1]], align 8 -// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP13]], [[TMP14]] -// CHECK1-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 -// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) -// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK1: .omp.reduction.done: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func -// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* -// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x 
i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1 -// CHECK1-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* -// CHECK1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* -// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* -// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 -// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK1-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK1-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) -// CHECK1-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 -// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1 -// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1 -// CHECK1-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 8 -// CHECK1-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK1-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK1-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK1-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] -// CHECK1-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK1-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 -// CHECK1-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 -// CHECK1-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] -// CHECK1-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK1-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] -// CHECK1-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] -// CHECK1-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] -// CHECK1-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: -// CHECK1-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* -// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: -// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK1-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK1-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK1: then4: -// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8 -// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8 -// CHECK1-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* -// CHECK1-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* -// CHECK1-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 -// CHECK1-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 -// CHECK1-NEXT: br label [[IFCONT6:%.*]] -// CHECK1: else5: -// CHECK1-NEXT: br label [[IFCONT6]] -// CHECK1: ifcont6: -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: 
[[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK1-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 -// CHECK1-NEXT: br label [[PRECOND:%.*]] -// CHECK1: precond: -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK1-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK1: body: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) -// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK1: then: -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK1-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 -// CHECK1-NEXT: br label [[IFCONT:%.*]] -// CHECK1: else: -// CHECK1-NEXT: br label [[IFCONT]] -// CHECK1: ifcont: -// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK1: then4: -// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8 -// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 -// CHECK1-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 -// CHECK1-NEXT: br label [[IFCONT6:%.*]] -// CHECK1: else5: -// CHECK1-NEXT: br label [[IFCONT6]] -// CHECK1: ifcont6: -// CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK1-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 -// CHECK1-NEXT: br label [[PRECOND]] -// CHECK1: exit: -// CHECK1-NEXT: ret void -// -// -// 
CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -// CHECK1-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = 
alloca i8*, align 8
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
-// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
-// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
-// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
-// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
-// CHECK1-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
-// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
-// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
-// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
-// CHECK1-SAME: () #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
-// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK1: .await.work:
-// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
-// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK1: .select.workers:
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK1: .execute.parallel:
-// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
-// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK1: .terminate.parallel:
-// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK1: .barrier.parallel:
-// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK1: .exit:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
-// CHECK1-SAME: (i64 [[C:%.*]], i64 [[D:%.*]]) #[[ATTR1]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store i64 [[C]], i64* [[C_ADDR]], align 8
-// CHECK1-NEXT: store i64 [[D]], i64* [[D_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8*
-// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float*
-// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
-// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
-// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
-// CHECK1: .worker:
-// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
-// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK1: .mastercheck:
-// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
-// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
-// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
-// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
-// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
-// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
-// CHECK1: .master:
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK1-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
-// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
-// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
-// CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* @"_openmp_static_kernel$size2", align 8
-// CHECK1-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
-// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
-// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i64 0
-// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.2*
-// CHECK1-NEXT: [[TMP10:%.*]] = load i8, i8* [[CONV]], align 8
-// CHECK1-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP9]], i32 0, i32 1
-// CHECK1-NEXT: store i8 [[TMP10]], i8* [[C8]], align 4
-// CHECK1-NEXT: [[TMP11:%.*]] = load float, float* [[CONV1]], align 8
-// CHECK1-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP9]], i32 0, i32 0
-// CHECK1-NEXT: store float [[TMP11]], float* [[D9]], align 4
-// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK1-NEXT: store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK1-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
-// CHECK1-NEXT: [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
-// CHECK1-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]])
-// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
-// CHECK1: .termination.notifier:
-// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK1-NEXT: br label [[DOTEXIT]]
-// CHECK1: .exit:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__3
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8
-// CHECK1-NEXT: store float* [[D]], float** [[D_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8
-// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8
-// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i64 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
-// CHECK1-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
-// CHECK1-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
-// CHECK1-NEXT: store i8 0, i8* [[C1]], align 4
-// CHECK1-NEXT: store float 1.000000e+00, float* [[D2]], align 4
-// CHECK1-NEXT: [[TMP5:%.*]] = load i8, i8* [[C1]], align 4
-// CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP5]] to i32
-// CHECK1-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
-// CHECK1-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
-// CHECK1-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4
-// CHECK1-NEXT: [[TMP6:%.*]] = load float, float* [[D2]], align 4
-// CHECK1-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 3.300000e+01
-// CHECK1-NEXT: store float [[MUL]], float* [[D2]], align 4
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK1-NEXT: store i8* [[C1]], i8** [[TMP9]], align 8
-// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP11:%.*]] = bitcast float* [[D2]] to i8*
-// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8
-// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
-// CHECK1-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 1024, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10)
-// CHECK1-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1
-// CHECK1-NEXT: br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
-// CHECK1: .omp.reduction.then:
-// CHECK1-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1
-// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP16]] to i32
-// CHECK1-NEXT: [[TMP17:%.*]] = load i8, i8* [[C1]], align 4
-// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32
-// CHECK1-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
-// CHECK1-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
-// CHECK1-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
-// CHECK1-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4
-// CHECK1-NEXT: [[TMP19:%.*]] = load float, float* [[D2]], align 4
-// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]]
-// CHECK1-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
-// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]])
-// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
-// CHECK1: .omp.reduction.done:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
-// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
-// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
-// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
-// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
-// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
-// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
-// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
-// CHECK1-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
-// CHECK1-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK1-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
-// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
-// CHECK1-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
-// CHECK1-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
-// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
-// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
-// CHECK1-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8
-// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8
-// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
-// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i64 1
-// CHECK1-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
-// CHECK1-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
-// CHECK1-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
-// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
-// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
-// CHECK1-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4
-// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i64 1
-// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1
-// CHECK1-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK1-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 8
-// CHECK1-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
-// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK1-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
-// CHECK1-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
-// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
-// CHECK1-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1
-// CHECK1-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
-// CHECK1-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
-// CHECK1-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
-// CHECK1-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
-// CHECK1-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
-// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
-// CHECK1-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK1: then:
-// CHECK1-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
-// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
-// CHECK1-NEXT: br label [[IFCONT:%.*]]
-// CHECK1: else:
-// CHECK1-NEXT: br label [[IFCONT]]
-// CHECK1: ifcont:
-// CHECK1-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK1-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
-// CHECK1-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
-// CHECK1-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
-// CHECK1: then6:
-// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8
-// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8
-// CHECK1-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
-// CHECK1-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1
-// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8
-// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8
-// CHECK1-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
-// CHECK1-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
-// CHECK1-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
-// CHECK1-NEXT: store float [[TMP62]], float* [[TMP61]], align 4
-// CHECK1-NEXT: br label [[IFCONT8:%.*]]
-// CHECK1: else7:
-// CHECK1-NEXT: br label [[IFCONT8]]
-// CHECK1: ifcont8:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
-// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK1: then:
-// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8
-// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
-// CHECK1-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
-// CHECK1-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
-// CHECK1-NEXT: br label [[IFCONT:%.*]]
-// CHECK1: else:
-// CHECK1-NEXT: br label [[IFCONT]]
-// CHECK1: ifcont:
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
-// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK1: then4:
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
-// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
-// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
-// CHECK1-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1
-// CHECK1-NEXT: br label [[IFCONT6:%.*]]
-// CHECK1: else5:
-// CHECK1-NEXT: br label [[IFCONT6]]
-// CHECK1: ifcont6:
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
-// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
-// CHECK1: then8:
-// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
-// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
-// CHECK1-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
-// CHECK1-NEXT: br label [[IFCONT10:%.*]]
-// CHECK1: else9:
-// CHECK1-NEXT: br label [[IFCONT10]]
-// CHECK1: ifcont10:
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
-// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
-// CHECK1: then12:
-// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 8
-// CHECK1-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
-// CHECK1-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
-// CHECK1-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4
-// CHECK1-NEXT: br label [[IFCONT14:%.*]]
-// CHECK1: else13:
-// CHECK1-NEXT: br label [[IFCONT14]]
-// CHECK1: ifcont14:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
-// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
-// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
-// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
-// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
-// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
-// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
-// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
-// CHECK1-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
-// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
-// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
-// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
-// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8
-// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
-// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
-// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
-// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
-// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
-// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
-// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
-// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
-// CHECK1-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
-// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
-// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
-// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
-// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8
-// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
-// CHECK1-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
-// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
-// CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
-// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
-// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16*
-// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
-// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK1: .execute:
-// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK1-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK1-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]]
-// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK1: .omp.deinit:
-// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
-// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK1: .exit:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__11
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
-// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
-// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
-// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4
-// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2
-// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
-// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
-// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i64 2)
-// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
-// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
-// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
-// CHECK1-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
-// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22)
-// CHECK1-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
-// CHECK1-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
-// CHECK1: .omp.reduction.then:
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
-// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
-// CHECK1-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
-// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
-// CHECK1-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
-// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK1: cond.true:
-// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK1-NEXT: br label [[COND_END:%.*]]
-// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK1-NEXT: br label [[COND_END]]
-// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
-// CHECK1-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
-// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
-// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
-// CHECK1: .omp.reduction.done:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__12
-// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
-// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
-// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
-// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
-// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
-// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4
-// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
-// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
-// CHECK1-NEXT: store i32 [[OR]], i32* [[A1]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
-// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
-// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK1: cond.true:
-// CHECK1-NEXT: br label [[COND_END:%.*]]
-// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
-// CHECK1-NEXT: br label [[COND_END]]
-// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
-// CHECK1-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
-// CHECK1-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
-// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8
-// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
-// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
-// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15)
-// CHECK1-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
-// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
-// CHECK1: .omp.reduction.then:
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
-// CHECK1-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
-// CHECK1-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
-// CHECK1-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK1-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
-// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
-// CHECK1-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
-// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
-// CHECK1: cond.true9:
-// CHECK1-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK1-NEXT: br label [[COND_END11:%.*]]
-// CHECK1: cond.false10:
-// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK1-NEXT: br label [[COND_END11]]
-// CHECK1: cond.end11:
-// CHECK1-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
-// CHECK1-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
-// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
-// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
-// CHECK1: .omp.reduction.done:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
-// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
-// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
-// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
-// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
-// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
-// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
-// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
-// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
-// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
-// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
-// CHECK1-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
-// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
-// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
-// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8
-// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
-// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
-// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
-// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
-// CHECK1-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
-// CHECK1-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
-// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
-// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
-// CHECK1-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
-// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
-// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
-// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8
-// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
-// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK1-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
-// CHECK1-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
-// CHECK1-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
-// CHECK1-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
-// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
-// CHECK1-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
-// CHECK1-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
-// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
-// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
-// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
-// CHECK1-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK1: then:
-// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
-// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK1-NEXT: call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
-// CHECK1-NEXT: br label [[IFCONT:%.*]]
-// CHECK1: else:
-// CHECK1-NEXT: br label [[IFCONT]]
-// CHECK1: ifcont:
-// CHECK1-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK1-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
-// CHECK1-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
-// CHECK1-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
-// CHECK1: then6:
-// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8
-// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
-// CHECK1-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
-// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
-// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
-// CHECK1-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
-// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8
-// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
-// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
-// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
-// CHECK1-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
-// CHECK1-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
-// CHECK1-NEXT: br label [[IFCONT8:%.*]]
-// CHECK1: else7:
-// CHECK1-NEXT: br label [[IFCONT8]]
-// CHECK1: ifcont8:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
-// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK1: then:
-// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8
-// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
-// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK1-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
-// CHECK1-NEXT: br label [[IFCONT:%.*]]
-// CHECK1: else:
-// CHECK1-NEXT: br label [[IFCONT]]
-// CHECK1: ifcont:
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
-// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK1: then4:
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
-// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
-// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
-// CHECK1-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
-// CHECK1-NEXT: br label [[IFCONT6:%.*]]
-// CHECK1: else5:
-// CHECK1-NEXT: br label [[IFCONT6]]
-// CHECK1: ifcont6:
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
-// CHECK1: then8:
-// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
-// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
-// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
-// CHECK1-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
-// CHECK1-NEXT: br label [[IFCONT10:%.*]]
-// CHECK1: else9:
-// CHECK1-NEXT: br label [[IFCONT10]]
-// CHECK1: ifcont10:
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
-// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
-// CHECK1: then12:
-// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
-// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8
-// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
-// CHECK1-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
-// CHECK1-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
-// CHECK1-NEXT: br label [[IFCONT14:%.*]]
-// CHECK1: else13:
-// CHECK1-NEXT: br label [[IFCONT14]]
-// CHECK1: ifcont14:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
-// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
-// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
-// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
-// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
-// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
-// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
-// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
-// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
-// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
-// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
-// CHECK1-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
-// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
-// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
-// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8
-// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
-// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
-// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
-// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
-// CHECK1-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
-// CHECK1-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
-// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
-// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
-// CHECK1-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
-// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
-// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
-// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8
-// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
-// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK1-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
-// CHECK1-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
-// CHECK1-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
-// CHECK1-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
-// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
-// CHECK1-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
-// CHECK1-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
-// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
-// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
-// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
-// CHECK1-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK1: then:
-// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
-// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK1-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
-// CHECK1-NEXT: br label [[IFCONT:%.*]]
-// CHECK1: else:
-// CHECK1-NEXT: br label [[IFCONT]]
-// CHECK1: ifcont:
-// CHECK1-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK1-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
-// CHECK1-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
-// CHECK1-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
-// CHECK1: then6:
-// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8
-// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
-// CHECK1-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
-// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
-// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
-// CHECK1-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
-// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8
-// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
-// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
-// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
-// CHECK1-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
-// CHECK1-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
-// CHECK1-NEXT: br label [[IFCONT8:%.*]]
-// CHECK1: else7:
-// CHECK1-NEXT: br label [[IFCONT8]]
-// CHECK1: ifcont8:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK1: then:
-// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8
-// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
-// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK1-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
-// CHECK1-NEXT: br label [[IFCONT:%.*]]
-// CHECK1: else:
-// CHECK1-NEXT: br label [[IFCONT]]
-// CHECK1: ifcont:
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
-// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK1: then4:
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
-// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
-// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
-// CHECK1-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
-// CHECK1-NEXT: br label [[IFCONT6:%.*]]
-// CHECK1: else5:
-// CHECK1-NEXT: br label [[IFCONT6]]
-// CHECK1: ifcont6:
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
-// CHECK1: then8:
-// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
-// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
-// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
-// CHECK1-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
-// CHECK1-NEXT: br label [[IFCONT10:%.*]]
-// CHECK1: else9:
-// CHECK1-NEXT: br label [[IFCONT10]]
-// CHECK1: ifcont10:
-// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
-// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
-// CHECK1: then12:
-// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
-// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8
-// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
-// CHECK1-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
-// CHECK1-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
-// CHECK1-NEXT: br label [[IFCONT14:%.*]]
-// CHECK1: else13:
-// CHECK1-NEXT: br label [[IFCONT14]]
-// CHECK1: ifcont14:
-// CHECK1-NEXT: ret void
-//
-//
-// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19
-// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK1-NEXT: entry:
-// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5*
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
-// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
-// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 0
-// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
-// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
-// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
-// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
-// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1
-// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
-// CHECK1-NEXT:
[[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 -// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5* -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5* -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* 
[[TMP6]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 -// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 -// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 -// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 -// CHECK1-NEXT: ret void -// -// -// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22 -// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5* -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1 -// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK1-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker -// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: 
call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 -// CHECK2-SAME: (i64 [[E:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[E]], i64* [[E_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double* -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 
@llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* -// CHECK2-NEXT: [[TMP7:%.*]] = load double, double* [[CONV]], align 8 -// CHECK2-NEXT: [[E7:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: store double [[TMP7]], double* [[E7]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: store i32 [[TMP8]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) -// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty.0* -// CHECK2-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP2]], i32 0, i32 0 -// CHECK2-NEXT: store double 0.000000e+00, double* [[E1]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8 -// CHECK2-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00 -// CHECK2-NEXT: store double [[ADD]], double* [[E1]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8* -// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK2-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 1024, i8* 
[[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) -// CHECK2-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 -// CHECK2-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK2: .omp.reduction.then: -// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8 -// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]] -// CHECK2-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 -// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) -// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK2: .omp.reduction.done: -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* -// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* -// CHECK2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* -// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* -// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK2-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) -// CHECK2-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 -// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1 -// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1 -// 
CHECK2-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK2-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] -// CHECK2-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK2-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 -// CHECK2-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 -// CHECK2-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] -// CHECK2-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK2-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] -// CHECK2-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] -// CHECK2-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] -// CHECK2-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* -// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK2-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK2: then4: -// CHECK2-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8 -// CHECK2-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8 -// CHECK2-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* -// CHECK2-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* -// CHECK2-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 -// CHECK2-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 -// CHECK2-NEXT: br label [[IFCONT6:%.*]] -// CHECK2: else5: -// CHECK2-NEXT: br label [[IFCONT6]] -// CHECK2: ifcont6: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK2-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 -// CHECK2-NEXT: br label 
[[PRECOND:%.*]] -// CHECK2: precond: -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK2-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK2: body: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) -// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK2-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK2: then4: -// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8 -// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK2-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 -// CHECK2-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 -// CHECK2-NEXT: br label [[IFCONT6:%.*]] -// CHECK2: else5: -// CHECK2-NEXT: br label [[IFCONT6]] -// CHECK2: ifcont6: -// CHECK2-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK2-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 -// CHECK2-NEXT: br label [[PRECOND]] -// CHECK2: exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* 
[[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -// CHECK2-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: 
[[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 -// CHECK2-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker -// CHECK2-SAME: () #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK2: .await.work: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 -// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK2: .select.workers: -// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK2: .terminate.parallel: -// CHECK2-NEXT: call 
void @__kmpc_kernel_end_parallel() -// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK2: .barrier.parallel: -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 -// CHECK2-SAME: (i64 [[C:%.*]], i64 [[D:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 -// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: store i64 [[C]], i64* [[C_ADDR]], align 8 -// CHECK2-NEXT: store i64 [[D]], i64* [[D_ADDR]], align 8 -// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8* -// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float* -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] -// CHECK2-NEXT: br label [[DOTEXIT:%.*]] -// CHECK2: .mastercheck: -// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 -// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 -// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] -// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK2: .master: -// CHECK2-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK2-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] -// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.2* -// CHECK2-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 8 -// CHECK2-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP6]], i32 0, i32 1 -// CHECK2-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 8 -// CHECK2-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: store float [[TMP8]], float* [[D9]], 
align 4 -// CHECK2-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK2: .termination.notifier: -// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK2-NEXT: br label [[DOTEXIT]] -// CHECK2: .exit: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 -// CHECK2-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8 -// CHECK2-NEXT: store float* [[D]], float** [[D_ADDR]], align 8 -// CHECK2-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8 -// CHECK2-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 1) -// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.3* -// CHECK2-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP3]], i32 0, i32 1 -// CHECK2-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP3]], i32 0, i32 0 -// CHECK2-NEXT: store i8 0, i8* [[C1]], align 4 -// CHECK2-NEXT: store float 1.000000e+00, float* [[D2]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK2-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32 -// CHECK2-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 -// CHECK2-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 -// CHECK2-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 -// CHECK2-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4 -// CHECK2-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01 -// CHECK2-NEXT: store float [[MUL]], float* [[D2]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: store i8* [[C1]], i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8* -// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** 
@"_openmp_teams_reductions_buffer_$_$ptr", align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) -// CHECK2-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 -// CHECK2-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK2: .omp.reduction.then: -// CHECK2-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1 -// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32 -// CHECK2-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32 -// CHECK2-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] -// CHECK2-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 -// CHECK2-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 -// CHECK2-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4 -// CHECK2-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 -// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] -// CHECK2-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 -// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK2: .omp.reduction.done: -// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 -// CHECK2-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 -// CHECK2-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 -// CHECK2-NEXT: 
[[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) -// CHECK2-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -// CHECK2-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 -// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 -// CHECK2-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 -// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* -// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i64 1 -// CHECK2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* -// CHECK2-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* -// CHECK2-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* -// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK2-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 -// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i64 1 -// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1 -// CHECK2-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK2-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 8 -// CHECK2-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] -// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK2-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 -// CHECK2-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 -// CHECK2-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] -// CHECK2-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK2-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] -// CHECK2-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] -// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] -// CHECK2-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK2-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK2-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] -// CHECK2-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK2: then6: -// CHECK2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* 
[[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8 -// CHECK2-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8 -// CHECK2-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 -// CHECK2-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 -// CHECK2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8 -// CHECK2-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8 -// CHECK2-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* -// CHECK2-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* -// CHECK2-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 -// CHECK2-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 -// CHECK2-NEXT: br label [[IFCONT8:%.*]] -// CHECK2: else7: -// CHECK2-NEXT: br label [[IFCONT8]] -// CHECK2: ifcont8: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK2: then: -// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* -// CHECK2-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 -// CHECK2-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 -// CHECK2-NEXT: br label [[IFCONT:%.*]] -// CHECK2: else: -// CHECK2-NEXT: br label [[IFCONT]] -// CHECK2: ifcont: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK2: then4: -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x 
i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 -// CHECK2-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 -// CHECK2-NEXT: br label [[IFCONT6:%.*]] -// CHECK2: else5: -// CHECK2-NEXT: br label [[IFCONT6]] -// CHECK2: ifcont6: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK2: then8: -// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 -// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 -// CHECK2-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 -// CHECK2-NEXT: br label [[IFCONT10:%.*]] -// CHECK2: else9: -// CHECK2-NEXT: br label [[IFCONT10]] -// CHECK2: ifcont10: -// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] -// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK2: then12: -// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 8 -// CHECK2-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* -// CHECK2-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 -// CHECK2-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 -// CHECK2-NEXT: br label [[IFCONT14:%.*]] -// CHECK2: else13: -// CHECK2-NEXT: br label [[IFCONT14]] -// CHECK2: ifcont14: -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: 
[[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 -// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 -// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 -// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 -// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 -// CHECK2-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 -// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 -// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 -// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK2-NEXT: ret void -// -// -// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 -// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 -// CHECK2-NEXT: 
-// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
-// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
-// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
-// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
-// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
-// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
-// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
-// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
-// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
-// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
-// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
-// CHECK2-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
-// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
-// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
-// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
-// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
-// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
-// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
-// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
-// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8
-// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
-// CHECK2-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
-// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK2-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
-// CHECK2-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
-// CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
-// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16*
-// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
-// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK2: .execute:
-// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK2-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK2-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]]
-// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK2: .omp.deinit:
-// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
-// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK2: .exit:
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9
-// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
-// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2
-// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
-// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
-// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
-// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
-// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
-// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4
-// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2
-// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
-// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
-// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
-// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
-// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
-// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i64 2)
-// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
-// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
-// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
-// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
-// CHECK2-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
-// CHECK2-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
-// CHECK2-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
-// CHECK2-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
-// CHECK2: .omp.reduction.then:
-// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
-// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
-// CHECK2-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
-// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
-// CHECK2-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
-// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
-// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK2: cond.true:
-// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK2-NEXT: br label [[COND_END:%.*]]
-// CHECK2: cond.false:
-// CHECK2-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK2-NEXT: br label [[COND_END]]
-// CHECK2: cond.end:
-// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
-// CHECK2-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
-// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
-// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
-// CHECK2: .omp.reduction.done:
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10
-// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
-// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
-// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2
-// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
-// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
-// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
-// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
-// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
-// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4
-// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2
-// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
-// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
-// CHECK2-NEXT: store i32 [[OR]], i32* [[A1]], align 4
-// CHECK2-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
-// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
-// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK2: cond.true:
-// CHECK2-NEXT: br label [[COND_END:%.*]]
-// CHECK2: cond.false:
-// CHECK2-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
-// CHECK2-NEXT: br label [[COND_END]]
-// CHECK2: cond.end:
-// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
-// CHECK2-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
-// CHECK2-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
-// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
-// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8
-// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
-// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
-// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK2-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
-// CHECK2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
-// CHECK2-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
-// CHECK2: .omp.reduction.then:
-// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
-// CHECK2-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
-// CHECK2-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
-// CHECK2-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK2-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
-// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
-// CHECK2-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
-// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
-// CHECK2: cond.true9:
-// CHECK2-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK2-NEXT: br label [[COND_END11:%.*]]
-// CHECK2: cond.false10:
-// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK2-NEXT: br label [[COND_END11]]
-// CHECK2: cond.end11:
-// CHECK2-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
-// CHECK2-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
-// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
-// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
-// CHECK2: .omp.reduction.done:
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
-// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
-// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
-// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
-// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
-// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
-// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
-// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
-// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
-// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
-// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
-// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
-// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
-// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
-// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
-// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
-// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK2-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
-// CHECK2-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
-// CHECK2-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
-// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
-// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
-// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8
-// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
-// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
-// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
-// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
-// CHECK2-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
-// CHECK2-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
-// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
-// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
-// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
-// CHECK2-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
-// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
-// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
-// CHECK2-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8
-// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
-// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK2-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
-// CHECK2-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
-// CHECK2-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
-// CHECK2-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
-// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
-// CHECK2-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
-// CHECK2-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
-// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
-// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
-// CHECK2-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
-// CHECK2-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK2: then:
-// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
-// CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK2-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
-// CHECK2-NEXT: br label [[IFCONT:%.*]]
-// CHECK2: else:
-// CHECK2-NEXT: br label [[IFCONT]]
-// CHECK2: ifcont:
-// CHECK2-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK2-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
-// CHECK2-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
-// CHECK2-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
-// CHECK2: then6:
-// CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8
-// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
-// CHECK2-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
-// CHECK2-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
-// CHECK2-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
-// CHECK2-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
-// CHECK2-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8
-// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
-// CHECK2-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
-// CHECK2-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
-// CHECK2-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
-// CHECK2-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
-// CHECK2-NEXT: br label [[IFCONT8:%.*]]
-// CHECK2: else7:
-// CHECK2-NEXT: br label [[IFCONT8]]
-// CHECK2: ifcont8:
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
-// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
-// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
-// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK2: then:
-// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8
-// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
-// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK2-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
-// CHECK2-NEXT: br label [[IFCONT:%.*]]
-// CHECK2: else:
-// CHECK2-NEXT: br label [[IFCONT]]
-// CHECK2: ifcont:
-// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
-// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK2: then4:
-// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
-// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
-// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
-// CHECK2-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
-// CHECK2-NEXT: br label [[IFCONT6:%.*]]
-// CHECK2: else5:
-// CHECK2-NEXT: br label [[IFCONT6]]
-// CHECK2: ifcont6:
-// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
-// CHECK2: then8:
-// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
-// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
-// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
-// CHECK2-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
-// CHECK2-NEXT: br label [[IFCONT10:%.*]]
-// CHECK2: else9:
-// CHECK2-NEXT: br label [[IFCONT10]]
-// CHECK2: ifcont10:
-// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
-// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
-// CHECK2: then12:
-// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
-// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8
-// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
-// CHECK2-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
-// CHECK2-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
-// CHECK2-NEXT: br label [[IFCONT14:%.*]]
-// CHECK2: else13:
-// CHECK2-NEXT: br label [[IFCONT14]]
-// CHECK2: ifcont14:
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
-// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
-// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
-// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
-// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
-// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
-// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
-// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
-// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
-// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
-// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
-// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
-// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
-// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
-// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
-// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
-// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
-// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK2-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
-// CHECK2-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
-// CHECK2-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
-// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
-// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
-// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8
-// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
-// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
-// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
-// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
-// CHECK2-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
-// CHECK2-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
-// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
-// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
-// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
-// CHECK2-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
-// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
-// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
-// CHECK2-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
-// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8
-// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
-// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK2-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
-// CHECK2-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
-// CHECK2-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
-// CHECK2-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
-// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
-// CHECK2-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
-// CHECK2-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
-// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
-// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
-// CHECK2-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
-// CHECK2-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK2: then:
-// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
-// CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
-// CHECK2-NEXT: br label [[IFCONT:%.*]]
-// CHECK2: else:
-// CHECK2-NEXT: br label [[IFCONT]]
-// CHECK2: ifcont:
-// CHECK2-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK2-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
-// CHECK2-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
-// CHECK2-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
-// CHECK2: then6:
-// CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8
-// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
-// CHECK2-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
-// CHECK2-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
-// CHECK2-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
-// CHECK2-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
-// CHECK2-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8
-// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
-// CHECK2-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
-// CHECK2-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
-// CHECK2-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
-// CHECK2-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
-// CHECK2-NEXT: br label [[IFCONT8:%.*]]
-// CHECK2: else7:
-// CHECK2-NEXT: br label [[IFCONT8]]
-// CHECK2: ifcont8:
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
-// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
-// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK2: then:
-// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8
-// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
-// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK2-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
-// CHECK2-NEXT: br label [[IFCONT:%.*]]
-// CHECK2: else:
-// CHECK2-NEXT: br label [[IFCONT]]
-// CHECK2: ifcont:
-// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
-// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK2: then4:
-// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
-// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
-// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
-// CHECK2-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
-// CHECK2-NEXT: br label [[IFCONT6:%.*]]
-// CHECK2: else5:
-// CHECK2-NEXT: br label [[IFCONT6]]
-// CHECK2: ifcont6:
-// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
-// CHECK2: then8:
-// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
-// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
-// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
-// CHECK2-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
-// CHECK2-NEXT: br label [[IFCONT10:%.*]]
-// CHECK2: else9:
-// CHECK2-NEXT: br label [[IFCONT10]]
-// CHECK2: ifcont10:
-// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
-// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
-// CHECK2: then12:
-// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
-// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8
-// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
-// CHECK2-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
-// CHECK2-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
-// CHECK2-NEXT: br label [[IFCONT14:%.*]]
-// CHECK2: else13:
-// CHECK2-NEXT: br label [[IFCONT14]]
-// CHECK2: ifcont14:
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
-// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5*
-// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
-// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
-// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 0
-// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
-// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
-// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
-// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
-// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
-// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1
-// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
-// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
-// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
-// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5*
-// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0
-// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
-// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
-// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1
-// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
-// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
-// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8
-// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
-// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.5*
-// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
-// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
-// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
-// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 0
-// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
-// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
-// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
-// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
-// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
-// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
-// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP6]], i32 0, i32 1
-// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
-// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
-// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
-// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK2-NEXT: entry:
-// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
-// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
-// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
-// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.5*
-// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
-// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5:%.*]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 0
-// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
-// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8
-// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
-// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_5]], %struct._globalized_locals_ty.5* [[TMP4]], i32 0, i32 1
-// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
-// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
-// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8
-// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
-// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
-// CHECK2-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
-// CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
-// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4
-// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK3: .await.work:
-// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK3: .select.workers:
-// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK3: .execute.parallel:
-// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
-// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK3: .terminate.parallel:
-// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK3: .barrier.parallel:
-// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK3: .exit:
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
-// CHECK3-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
-// CHECK3-NEXT: [[E7:%.*]] = alloca double, align 8
-// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
-// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
-// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
-// CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
-// CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
-// CHECK3: .worker:
-// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
-// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK3: .mastercheck:
-// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
-// CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
-// CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
-// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
-// CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
-// CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
-// CHECK3: .master:
-// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
-// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
-// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK3-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
-// CHECK3-NEXT: store double [[TMP7]], double* [[E7]], align 8
-// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
-// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
-// CHECK3: .termination.notifier:
-// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK3-NEXT: br label [[DOTEXIT]]
-// CHECK3: .exit:
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
-// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
-// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
-// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
-// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
-// CHECK3-NEXT: [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
-// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
-// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
-// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
-// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0
-// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
-// CHECK3-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
-// CHECK3-NEXT: store double 0.000000e+00, double* [[E1]], align 8
-// CHECK3-NEXT: [[TMP6:%.*]] = load double, double* [[E1]], align 8
-// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[TMP6]], 5.000000e+00
-// CHECK3-NEXT: store double [[ADD]], double* [[E1]], align 8
-// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK3-NEXT: [[TMP10:%.*]] = bitcast double* [[E1]] to i8*
-// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
-// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
-// CHECK3-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
-// CHECK3-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
-// CHECK3: .omp.reduction.then:
-// CHECK3-NEXT: [[TMP15:%.*]] = load double, double* [[TMP0]], align 8
-// CHECK3-NEXT: [[TMP16:%.*]] = load double, double* [[E1]], align 8
-// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP15]], [[TMP16]]
-// CHECK3-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
-// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]])
-// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
-// CHECK3: .omp.reduction.done:
-// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
-// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]])
-// CHECK3-NEXT: ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
-// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT: entry:
-// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
-// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
-// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
-// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
-// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
-// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
-// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
-// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
-// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
-// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
-// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
-// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
-// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
-// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
-// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
-// CHECK3-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
-// CHECK3-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
-// CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
-// CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
-// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK3-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
-// CHECK3-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
-// CHECK3-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
-// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
-// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
-// CHECK3-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK3-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4
-// CHECK3-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
-// CHECK3-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK3-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
-// CHECK3-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
-// CHECK3-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
-// CHECK3-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
-// CHECK3-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
-// CHECK3-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
-// CHECK3-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
-// CHECK3-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
-// CHECK3-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
-// CHECK3-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
-// CHECK3-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK3: then:
-// CHECK3-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
-// CHECK3-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
-// CHECK3-NEXT: br label [[IFCONT:%.*]]
-// CHECK3: else:
CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] -// CHECK3-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK3: then4: -// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 -// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 -// CHECK3-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* -// CHECK3-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* -// CHECK3-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 -// CHECK3-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 -// CHECK3-NEXT: br label [[IFCONT6:%.*]] -// CHECK3: else5: -// CHECK3-NEXT: br label [[IFCONT6]] -// CHECK3: ifcont6: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK3-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 -// CHECK3-NEXT: br label [[PRECOND:%.*]] -// CHECK3: precond: -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK3-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK3: body: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK3-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: 
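Note: the icmp/and/or chain ahead of the then/else split in _omp_reduction_shuffle_and_reduce_func encodes which lanes fold the shuffled remote element into their own. Spelled out as a predicate (argument names follow the usual lane-id / offset / algorithm-version reading of the three i16 parameters; a sketch, not the generated code):

  #include <cstdint>

  // true  -> reduce remote into local (the "then" block, calling the reduction func)
  // false -> when algo == 1 and lane_id >= offset, copy remote over local instead
  bool lane_reduces(uint16_t lane_id, uint16_t offset, uint16_t algo) {
    return algo == 0 ||                                     // round 0: every lane
           (algo == 1 && lane_id < offset) ||               // lower lane of each pair
           (algo == 2 && (lane_id & 1) == 0 && offset > 0); // even lanes, late rounds
  }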
call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK3: then4: -// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK3-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 -// CHECK3-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 -// CHECK3-NEXT: br label [[IFCONT6:%.*]] -// CHECK3: else5: -// CHECK3-NEXT: br label [[IFCONT6]] -// CHECK3: ifcont6: -// CHECK3-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK3-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 -// CHECK3-NEXT: br label [[PRECOND]] -// CHECK3: exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -// CHECK3-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], 
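Note: the precond/body/exit loop above (icmp ult i32 %cnt, 2) is how the inter-warp copy moves a 64-bit reduction element: two barrier-staged 32-bit rounds through the shared transfer array. Roughly (transfer_chunk is a hypothetical stand-in for one publish/gather round):

  #include <cstdint>
  #include <cstring>

  void transfer_chunk(int32_t chunk, uint32_t round); // hypothetical: one
                                                      // barrier-staged round

  void copy_double_partial(double e_partial) {
    int32_t chunks[2];
    std::memcpy(chunks, &e_partial, sizeof(chunks));  // view the double as 2 x i32
    for (uint32_t cnt = 0; cnt < 2; ++cnt)            // icmp ult i32 %cnt, 2
      transfer_chunk(chunks[cnt], cnt);
  }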
align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 -// CHECK3-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[E:%.*]] = 
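Note: the list_to_global / global_to_list helpers in this stretch all index the same scratchpad shape: a struct-of-arrays with one 1024-entry column per reduced variable, the column index being the team's slot. Assuming that layout (the function-pointer parameter generalizes the fixed "_omp$reduction$reduction_func" the IR calls):

  struct TeamsBuffer { double e[1024]; };  // %struct._globalized_locals_ty.0

  void list_to_global_copy(void *gbuf, int idx, void **red_list) {
    static_cast<TeamsBuffer *>(gbuf)->e[idx] = *static_cast<double *>(red_list[0]);
  }

  void list_to_global_reduce(void *gbuf, int idx, void *remote_list,
                             void (*reduce)(void *, void *)) {
    void *slot_list[1] = {&static_cast<TeamsBuffer *>(gbuf)->e[idx]};
    reduce(slot_list, remote_list);        // accumulate into the team's slot
  }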
getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK3: .await.work: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK3: .select.workers: -// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK3: .execute.parallel: -// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK3: .terminate.parallel: -// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK3: .barrier.parallel: -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 -// CHECK3-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* -// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 
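Note: the _worker routine that just ended is the generic-mode state machine: workers park on a barrier, ask the runtime for an outlined parallel function, run it if active, and loop until the master publishes a null work function. Its control flow, approximately (runtime declarations abbreviated; the test passes a real ident_t where nullptr appears here):

  #include <cstdint>

  extern "C" bool __kmpc_kernel_parallel(void **work_fn);
  extern "C" void __kmpc_kernel_end_parallel();
  extern "C" void __kmpc_barrier_simple_spmd(void *loc, int32_t tid);
  extern "C" int32_t __kmpc_global_thread_num(void *loc);

  void worker_loop() {
    void *work_fn = nullptr;
    for (;;) {
      __kmpc_barrier_simple_spmd(nullptr, 0);        // .await.work
      bool is_active = __kmpc_kernel_parallel(&work_fn);
      if (!work_fn)                                  // master signalled shutdown
        return;                                      // .exit
      if (is_active) {                               // .execute.parallel
        int32_t gtid = __kmpc_global_thread_num(nullptr);
        reinterpret_cast<void (*)(int16_t, int32_t)>(work_fn)(0, gtid);
        __kmpc_kernel_end_parallel();                // .terminate.parallel
      }
      __kmpc_barrier_simple_spmd(nullptr, 0);        // .barrier.parallel
    }
  }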
@llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK3: .worker: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .mastercheck: -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 -// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 -// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] -// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK3: .master: -// CHECK3-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] -// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4 -// CHECK3-NEXT: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) -// CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.1* -// CHECK3-NEXT: [[TMP10:%.*]] = load i8, i8* [[CONV]], align 4 -// CHECK3-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 1 -// CHECK3-NEXT: store i8 [[TMP10]], i8* [[C8]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = load float, float* [[CONV1]], align 4 -// CHECK3-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 0 -// CHECK3-NEXT: store float [[TMP11]], float* [[D9]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] -// CHECK3-NEXT: [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2 -// CHECK3-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]]) -// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// 
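Note: the .mastercheck arithmetic above reduces to "first lane of the last warp". With the values plugged in for a 128-thread block and 32-lane warps:

  unsigned master_tid_for(unsigned ntid, unsigned warp_size) {
    unsigned thread_limit = ntid - warp_size; // 128 - 32 = 96 workers (tids 0..95)
    (void)thread_limit;
    return (ntid - 1) & ~(warp_size - 1);     // 127 & ~31 = 96: last warp, lane 0
  }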
CHECK3: .termination.notifier: -// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK3-NEXT: br label [[DOTEXIT]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__3 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 -// CHECK3-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 8 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.2* -// CHECK3-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: store i8 0, i8* [[C1]], align 4 -// CHECK3-NEXT: store float 1.000000e+00, float* [[D2]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK3-NEXT: [[CONV:%.*]] = sext i8 [[TMP5]] to i32 -// CHECK3-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 -// CHECK3-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 -// CHECK3-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load float, float* [[D2]], align 4 -// CHECK3-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 3.300000e+01 -// CHECK3-NEXT: store float [[MUL]], float* [[D2]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: store i8* [[C1]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast float* [[D2]] to i8* -// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 1024, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* 
@_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10) -// CHECK3-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1 -// CHECK3-NEXT: br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1 -// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP16]] to i32 -// CHECK3-NEXT: [[TMP17:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32 -// CHECK3-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] -// CHECK3-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 -// CHECK3-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 -// CHECK3-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load float, float* [[D2]], align 4 -// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]] -// CHECK3-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 -// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]]) -// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK3: .omp.reduction.done: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK3-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 -// CHECK3-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) -// CHECK3-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -// CHECK3-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* 
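Note: shuffle_and_reduce_func5 above shows how a sub-word element ('c' is an i8) crosses lanes: widen to 32 bits, go through __kmpc_shuffle_int32, narrow back. As a sketch (the wrapper name is illustrative; __kmpc_shuffle_int32 is the entry point the IR calls):

  #include <cstdint>

  extern "C" int32_t __kmpc_shuffle_int32(int32_t val, int16_t offset,
                                          int16_t width);

  char shuffle_char(char c, int16_t offset, int16_t warp_size) {
    int32_t widened = static_cast<int32_t>(c);                 // sext i8 -> i32
    int32_t moved = __kmpc_shuffle_int32(widened, offset, warp_size);
    return static_cast<char>(moved);                           // trunc i32 -> i8
  }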
[[TMP10]], i32 1 -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK3-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 -// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* -// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 -// CHECK3-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* -// CHECK3-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* -// CHECK3-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK3-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 -// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 -// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 -// CHECK3-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK3-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 -// CHECK3-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] -// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK3-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 -// CHECK3-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 -// CHECK3-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] -// CHECK3-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK3-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] -// CHECK3-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] -// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] -// CHECK3-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] -// CHECK3-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK3: then6: -// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 -// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 -// CHECK3-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 -// CHECK3-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 -// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x 
i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 -// CHECK3-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 -// CHECK3-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* -// CHECK3-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* -// CHECK3-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 -// CHECK3-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 -// CHECK3-NEXT: br label [[IFCONT8:%.*]] -// CHECK3: else7: -// CHECK3-NEXT: br label [[IFCONT8]] -// CHECK3: ifcont8: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* -// CHECK3-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 -// CHECK3-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK3: then4: -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 -// CHECK3-NEXT: store i8 
[[TMP15]], i8* [[TMP14]], align 1 -// CHECK3-NEXT: br label [[IFCONT6:%.*]] -// CHECK3: else5: -// CHECK3-NEXT: br label [[IFCONT6]] -// CHECK3: ifcont6: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK3: then8: -// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 -// CHECK3-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 -// CHECK3-NEXT: br label [[IFCONT10:%.*]] -// CHECK3: else9: -// CHECK3-NEXT: br label [[IFCONT10]] -// CHECK3: ifcont10: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK3: then12: -// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 -// CHECK3-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* -// CHECK3-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 -// CHECK3-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 -// CHECK3-NEXT: br label [[IFCONT14:%.*]] -// CHECK3: else13: -// CHECK3-NEXT: br label [[IFCONT14]] -// CHECK3: ifcont14: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 -// CHECK3-NEXT: 
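Note: inter_warp_copy_func6, which just completed, is the canonical staging pattern: one barrier-fenced round per reduction-list element through a 32-slot shared array, warp masters writing and the first num_warps threads reading back. A CUDA-flavored sketch of one round (__syncthreads stands in for the __kmpc_barrier calls in the test):

  #include <cstdint>

  __device__ void inter_warp_round(int32_t *elem, int num_active_warps) {
    // @__openmp_nvptx_data_transfer_temporary_storage in the CHECK lines
    __shared__ int32_t staging[32];
    int tid = threadIdx.x, lane = tid & 31, warp = tid >> 5;
    __syncthreads();
    if (lane == 0)                   // warp master publishes its partial
      staging[warp] = *elem;
    __syncthreads();
    if (tid < num_active_warps)      // one gathering thread per active warp
      *elem = staging[tid];
  }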
store i8 [[TMP11]], i8* [[TMP10]], align 128 -// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 -// CHECK3-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// 
CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 -// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 -// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 -// CHECK3-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 
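Note: with global_to_list_reduce_func10 the set of buffer helpers is complete, and the only difference between the two *_reduce helpers is which operand of "_omp$reduction$reduction_func4" points into the buffer: list_to_global accumulates into the team's slot, global_to_list accumulates the slot into the private copy. Schematically:

  void list_to_global_reduce_(void *slot_list, void *priv_list,
                              void (*reduce)(void *, void *)) {
    reduce(slot_list, priv_list);   // buffer slot += private partial
  }

  void global_to_list_reduce_(void *slot_list, void *priv_list,
                              void (*reduce)(void *, void *)) {
    reduce(priv_list, slot_list);   // private partial += buffer slot
  }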
-// CHECK3-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK3-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 -// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() -// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK3: .execute: -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK3: .omp.deinit: -// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) -// CHECK3-NEXT: br label [[DOTEXIT:%.*]] -// CHECK3: .exit: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__11 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* 
bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i32 2) -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22) -// CHECK3-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 -// CHECK3-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] -// CHECK3-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 -// CHECK3-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] -// CHECK3-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 -// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK3: .omp.reduction.done: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__12 -// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i32* 
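Note: the __kmpc_parallel_51 call whose argument list closes above is what the SPMD parallel region lowers to. Its shape, matched to the operands in the CHECK lines (ident_t abbreviated to void*; the -1s mean no num_threads / proc_bind clause):

  #include <cstdint>
  #include <cstddef>

  extern "C" void __kmpc_parallel_51(void *loc, int32_t gtid, int32_t if_expr,
                                     int32_t num_threads, int32_t proc_bind,
                                     void *fn, void *wrapper_fn, void **args,
                                     size_t nargs);

  void launch_parallel(void *loc, int32_t gtid, int32_t *a1, int16_t *b2,
                       void *outlined /* __omp_outlined__12 */) {
    void *captured[2] = {a1, b2};           // the [2 x i8*] CAPTURED_VARS_ADDRS
    __kmpc_parallel_51(loc, gtid, /*if_expr=*/1, /*num_threads=*/-1,
                       /*proc_bind=*/-1, outlined, /*wrapper_fn=*/nullptr,
                       captured, /*nargs=*/2);
  }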
[[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 -// CHECK3-NEXT: store i32 [[OR]], i32* [[A1]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 -// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] -// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK3: cond.true: -// CHECK3-NEXT: br label [[COND_END:%.*]] -// CHECK3: cond.false: -// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 -// CHECK3-NEXT: br label [[COND_END]] -// CHECK3: cond.end: -// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] -// CHECK3-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 -// CHECK3-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 -// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15) -// CHECK3-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 -// CHECK3-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK3: .omp.reduction.then: -// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK3-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] -// CHECK3-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK3-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 -// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 -// CHECK3-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] -// CHECK3-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] -// CHECK3: cond.true9: -// CHECK3-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK3-NEXT: br label [[COND_END11:%.*]] -// CHECK3: cond.false10: -// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK3-NEXT: br label [[COND_END11]] -// CHECK3: 
cond.end11: -// CHECK3-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] -// CHECK3-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 -// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) -// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK3: .omp.reduction.done: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK3-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK3-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK3-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 
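Note: the .omp.reduction.then / .done split that just closed follows the usual contract: __kmpc_nvptx_parallel_reduce_nowait_v2 returns 1 on exactly the thread that must fold the partials into the original variables. In outline (declarations abbreviated; 8 is the packed size of the i32+i16 reduction data):

  #include <algorithm>
  #include <cstdint>
  #include <cstddef>

  extern "C" int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
      void *loc, int32_t gtid, int32_t num_vars, size_t reduce_bytes,
      void *red_list, void *shuffle_fn, void *inter_warp_fn);
  extern "C" void __kmpc_nvptx_end_reduce_nowait(int32_t gtid);

  void finish_parallel_reduction(void *loc, int32_t gtid, void *red_list,
                                 void *shuffle_fn, void *inter_warp_fn,
                                 int32_t *a, int32_t a_partial,
                                 int16_t *b, int16_t b_partial) {
    int32_t ret = __kmpc_nvptx_parallel_reduce_nowait_v2(
        loc, gtid, /*num_vars=*/2, /*reduce_bytes=*/8, red_list,
        shuffle_fn, inter_warp_fn);
    if (ret == 1) {                       // .omp.reduction.then
      *a |= a_partial;                    // or-reduction of 'a'
      *b = std::max(*b, b_partial);       // max-reduction of 'b' (icmp sgt + phi)
      __kmpc_nvptx_end_reduce_nowait(gtid);
    }                                     // .omp.reduction.done
  }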
@llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK3-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK3-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK3-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK3-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK3-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK3-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK3-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK3: then6: -// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK3-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK3-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK3-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK3-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK3-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK3-NEXT: br label [[IFCONT8:%.*]] -// CHECK3: else7: -// CHECK3-NEXT: br label [[IFCONT8]] -// CHECK3: ifcont8: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define 
{{[^@]+}}@_omp_reduction_inter_warp_copy_func15 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK3: then4: -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK3-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK3-NEXT: br label [[IFCONT6:%.*]] -// CHECK3: else5: -// CHECK3-NEXT: br label [[IFCONT6]] -// CHECK3: ifcont6: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK3: then8: -// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 
0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK3-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK3-NEXT: br label [[IFCONT10:%.*]] -// CHECK3: else9: -// CHECK3-NEXT: br label [[IFCONT10]] -// CHECK3: ifcont10: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK3: then12: -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK3-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK3-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK3-NEXT: br label [[IFCONT14:%.*]] -// CHECK3: else13: -// CHECK3-NEXT: br label [[IFCONT14]] -// CHECK3: ifcont14: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// 
CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK3-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK3-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK3-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK3-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK3-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK3-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK3-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK3-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK3-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK3-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK3-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK3-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK3: then6: -// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr 
inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK3-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK3-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK3-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK3-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK3-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK3-NEXT: br label [[IFCONT8:%.*]] -// CHECK3: else7: -// CHECK3-NEXT: br label [[IFCONT8]] -// CHECK3: ifcont8: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK3: then: -// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK3-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK3-NEXT: br label [[IFCONT:%.*]] -// CHECK3: else: -// CHECK3-NEXT: br label [[IFCONT]] -// CHECK3: ifcont: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK3: then4: -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* 
@__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK3-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK3-NEXT: br label [[IFCONT6:%.*]] -// CHECK3: else5: -// CHECK3-NEXT: br label [[IFCONT6]] -// CHECK3: ifcont6: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK3: then8: -// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK3-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK3-NEXT: br label [[IFCONT10:%.*]] -// CHECK3: else9: -// CHECK3-NEXT: br label [[IFCONT10]] -// CHECK3: ifcont10: -// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK3: then12: -// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK3-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK3-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK3-NEXT: br label [[IFCONT14:%.*]] -// CHECK3: else13: -// CHECK3-NEXT: br label [[IFCONT14]] -// CHECK3: ifcont14: -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to 
%struct._globalized_locals_ty.4* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 -// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define 
{{[^@]+}}@_omp_reduction_global_to_list_copy_func21 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 -// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 -// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 -// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 -// CHECK3-NEXT: ret void -// -// -// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22 -// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds 
[2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK3-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker // CHECK4-SAME: () #[[ATTR0:[0-9]+]] { // CHECK4-NEXT: entry: @@ -4434,8 +83,6 @@ // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 // CHECK4-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { // CHECK4-NEXT: entry: @@ -4483,8 +130,6 @@ // CHECK4-NEXT: br label [[DOTEXIT]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__ // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: @@ -4528,8 +173,6 @@ // CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2 // CHECK4-NEXT: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]]) // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func // CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4604,8 +247,6 @@ // CHECK4-NEXT: br label [[IFCONT6]] // CHECK4: ifcont6: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4665,8 +306,6 @@ // CHECK4-NEXT: br label [[PRECOND]] // CHECK4: exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4689,8 +328,6 @@ // CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 // CHECK4-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4713,8 +350,6 @@ // CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 // CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4737,8 +372,6 @@ // CHECK4-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 // CHECK4-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 // CHECK4-NEXT: ret void -// 
-// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4761,8 +394,6 @@ // CHECK4-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 // CHECK4-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker // CHECK4-SAME: () #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -4796,8 +427,6 @@ // CHECK4-NEXT: br label [[DOTAWAIT_WORK]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 // CHECK4-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: @@ -4859,8 +488,6 @@ // CHECK4-NEXT: br label [[DOTEXIT]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: @@ -4918,8 +545,6 @@ // CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]] // CHECK4: .omp.reduction.done: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5 // CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -5015,8 +640,6 @@ // CHECK4-NEXT: br label [[IFCONT8]] // CHECK4: ifcont8: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -5091,8 +714,6 @@ // CHECK4-NEXT: br label [[IFCONT14]] // CHECK4: ifcont14: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -5121,8 +742,6 @@ // CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 // CHECK4-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -5149,8 +768,6 @@ // CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 // CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -5179,8 +796,6 @@ // CHECK4-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 // CHECK4-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -5207,8 +822,6 @@ // CHECK4-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 // CHECK4-NEXT: call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] // CHECK4-NEXT: ret void 
-// -// // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 // CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: @@ -5234,8 +847,6 @@ // CHECK4-NEXT: br label [[DOTEXIT:%.*]] // CHECK4: .exit: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: @@ -5300,8 +911,6 @@ // CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]] // CHECK4: .omp.reduction.done: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__12 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { // CHECK4-NEXT: entry: @@ -5373,8 +982,6 @@ // CHECK4-NEXT: br label [[DOTOMP_REDUCTION_DONE]] // CHECK4: .omp.reduction.done: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14 // CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -5473,8 +1080,6 @@ // CHECK4-NEXT: br label [[IFCONT8]] // CHECK4: ifcont8: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -5551,8 +1156,6 @@ // CHECK4-NEXT: br label [[IFCONT14]] // CHECK4: ifcont14: // CHECK4-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17 // CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { // CHECK4-NEXT: entry: @@ -5651,3082 +1254,7265 @@ // CHECK4-NEXT: br label [[IFCONT8]] // CHECK4: ifcont8: // CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK4: then: +// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* 
[[TMP6]] to i32* +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK4-NEXT: br label [[IFCONT:%.*]] +// CHECK4: else: +// CHECK4-NEXT: br label [[IFCONT]] +// CHECK4: ifcont: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK4: then4: +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK4-NEXT: br label [[IFCONT6:%.*]] +// CHECK4: else5: +// CHECK4-NEXT: br label [[IFCONT6]] +// CHECK4: ifcont6: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK4: then8: +// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK4-NEXT: br label [[IFCONT10:%.*]] +// CHECK4: else9: +// CHECK4-NEXT: br label [[IFCONT10]] +// CHECK4: ifcont10: +// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK4: then12: +// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK4-NEXT: br label [[IFCONT14:%.*]] +// CHECK4: else13: +// 
CHECK4-NEXT: br label [[IFCONT14]] +// CHECK4: ifcont14: +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// 
CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 +// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 +// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 +// CHECK4-NEXT: ret void +// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22 +// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK4-NEXT: entry: +// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: store i32 [[TMP1]], i32* 
[[DOTADDR1]], align 4 +// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK4-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker +// CHECK5-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK5: .await.work: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK5: .select.workers: +// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK5: .execute.parallel: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK5: .terminate.parallel: +// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK5: .barrier.parallel: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 +// CHECK5-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK5-NEXT: [[E7:%.*]] = alloca double, align 8 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK5: .worker: +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .mastercheck: +// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK5: .master: +// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK5-NEXT: store double [[TMP7]], double* [[E7]], align 8 +// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK5: .termination.notifier: +// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTEXIT]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca 
i32*, align 4 +// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) +// CHECK5-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK5-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK5-NEXT: store double 0.000000e+00, double* [[E1]], align 8 +// CHECK5-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8 +// CHECK5-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00 +// CHECK5-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK5-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8* +// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 1024, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK5-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 +// CHECK5-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK5: .omp.reduction.then: +// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK5-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8 +// CHECK5-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]] +// CHECK5-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK5: .omp.reduction.done: +// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK5-NEXT: store i8* [[TMP0]], i8** 
[[DOTADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK5-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK5-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK5-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK5-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK5-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK5-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK5-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK5-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK5-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK5-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK5-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK5-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK5-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK5-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK5-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK5-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK5-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP41:%.*]] = load i8*, i8** 
[[TMP40]], align 4 +// CHECK5-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK5-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK5-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK5-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK5-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK5-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK5-NEXT: br label [[PRECOND:%.*]] +// CHECK5: precond: +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK5-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK5: body: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK5-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: 
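// Note on the block above and below: @__openmp_nvptx_data_transfer_temporary_storage
// is a [32 x i32] array in addrspace(3), i.e. one 32-bit staging slot per warp in
// CUDA shared memory. Each iteration of the 'precond'..'exit' loop (bounded by 2
// here because the double 'e' is two 32-bit chunks) has the lane-0 "warp master"
// publish its warp's partial value into slot [warp_id]; after the __kmpc_barrier,
// the first <num-active-warps> threads read the slots back (the volatile load that
// follows) so the remaining reduction can proceed within a single warp.
+// CHECK5-NEXT: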
[[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK5-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK5-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK5-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK5-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK5-NEXT: br label [[PRECOND]] +// CHECK5: exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK5-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], 
i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK5-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void 
@"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker +// CHECK5-SAME: () #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK5: .await.work: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK5: .select.workers: +// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK5: .execute.parallel: +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK5: .terminate.parallel: +// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK5: .barrier.parallel: +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 +// CHECK5-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* +// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK5: .worker: +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .mastercheck: +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: 
[[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK5: .master: +// CHECK5-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK5-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4 +// CHECK5-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4 +// CHECK5-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: store float [[TMP8]], float* [[D9]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] +// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK5: .termination.notifier: +// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK5-NEXT: br label [[DOTEXIT]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK5-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) +// 
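// Note: the reduction privates are "globalized" here instead of living in allocas.
// __kmpc_data_sharing_push_stack returns runtime-managed memory (8 bytes covering
// %struct._globalized_locals_ty.2 = { float, i8 } plus padding) so the
// teams-reduction helpers can address c1/d2 through generic i8* reduce lists; the
// matching __kmpc_data_sharing_pop_stack at function exit releases it. The stores
// of 0 and 1.0 below are the identity values for the '^' and '*' reductions.
// Roughly the source shape that lowers to this kernel pair (a paraphrase for
// orientation, not the verbatim test input):
//
//   template <typename T> T ftemplate(int n) {
//     char c = 0;
//     float d = 1.0f;
//   #pragma omp target teams reduction(^: c) reduction(*: d)
//     { c ^= 2; d *= 33; }
//     return c;
//   }
//
+//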
CHECK5-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2* +// CHECK5-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1 +// CHECK5-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0 +// CHECK5-NEXT: store i8 0, i8* [[C1]], align 4 +// CHECK5-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK5-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32 +// CHECK5-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK5-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK5-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4 +// CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01 +// CHECK5-NEXT: store float [[MUL]], float* [[D2]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) +// CHECK5-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 +// CHECK5-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK5: .omp.reduction.then: +// CHECK5-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK5-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32 +// CHECK5-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4 +// CHECK5-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32 +// CHECK5-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK5-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK5-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK5-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 +// CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] +// CHECK5-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK5: .omp.reduction.done: +// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], 
i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK5-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK5-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK5-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK5-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK5-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK5-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK5-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK5-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK5-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK5-NEXT: [[TMP33:%.*]] = bitcast float* 
[[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK5-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK5-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK5-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK5-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK5-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK5-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK5-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK5-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK5-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK5-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK5-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK5: then6: +// CHECK5-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK5-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK5-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK5-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK5-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK5-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK5-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK5-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK5-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK5-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK5-NEXT: br label [[IFCONT8:%.*]] +// CHECK5: else7: +// CHECK5-NEXT: br label [[IFCONT8]] +// CHECK5: ifcont8: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 
31 +// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK5-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK5-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK5-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK5: then8: +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK5-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK5-NEXT: br label [[IFCONT10:%.*]] +// CHECK5: else9: +// CHECK5-NEXT: br label [[IFCONT10]] +// CHECK5: ifcont10: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK5: then12: +// 
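// Note: one publish/barrier/read-back round runs per reduce-list element. The i8
// 'c' just went through the staging slot via an i8 view of the i32 slot (the
// volatile i8 store/load above), and 'd' moves as a full 32-bit chunk in this
// second round. Sub-word elements never need more than one round; wider ones
// (the double 'e' earlier) loop over 32-bit chunks instead.
+//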
CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK5-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK5-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK5-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK5-NEXT: br label [[IFCONT14:%.*]] +// CHECK5: else13: +// CHECK5-NEXT: br label [[IFCONT14]] +// CHECK5: ifcont14: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 +// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 +// CHECK5-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to 
%struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 +// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 +// CHECK5-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 +// CHECK5-SAME: (i8* [[TMP0:%.*]], 
i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 +// CHECK5-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +// CHECK5-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 +// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) +// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] +// CHECK5: .execute: +// CHECK5-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK5-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK5-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK5: .omp.deinit: +// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK5-NEXT: br label [[DOTEXIT:%.*]] +// CHECK5: .exit: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK5-SAME: (i32* noalias 
[[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK5-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK5-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK5-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) +// CHECK5-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK5-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// 
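// Note: __kmpc_nvptx_teams_reduce_nowait_v2 funnels each team's partial values
// through the global scratch buffer bound to @"_openmp_teams_reductions_buffer_$_$ptr"
// (sized for 1024 teams here) using the list_to_global_*/global_to_list_* helpers,
// and returns 1 on exactly one thread, which must fold the team-local values into
// the original variables. A hedged sketch of that contract (stand-in C++ mirroring
// the .omp.reduction.then block below, not the runtime source):
//
//   if (__kmpc_nvptx_teams_reduce_nowait_v2(/* loc, gtid, buffer, ... */) == 1) {
//     *a |= a1;                      // '|' reduction into the mapped original
//     *b = *b > b2 ? *b : b2;        // 'max' reduction
//     __kmpc_nvptx_end_reduce_nowait(gtid);
//   }
//
+//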
CHECK5: .omp.reduction.then: +// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK5-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK5-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK5-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK5: .omp.reduction.done: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK5-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK5: cond.true: +// CHECK5-NEXT: br label [[COND_END:%.*]] +// CHECK5: cond.false: +// CHECK5-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK5-NEXT: br label [[COND_END]] +// CHECK5: cond.end: +// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK5-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK5-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK5-NEXT: 
[[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) +// CHECK5-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK5-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK5: .omp.reduction.then: +// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK5-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK5-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK5-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK5-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK5-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK5: cond.true9: +// CHECK5-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK5-NEXT: br label [[COND_END11:%.*]] +// CHECK5: cond.false10: +// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK5-NEXT: br label [[COND_END11]] +// CHECK5: cond.end11: +// CHECK5-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK5-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK5: .omp.reduction.done: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: 
[[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] 
to i8* +// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK5: then6: +// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK5-NEXT: br label [[IFCONT8:%.*]] +// CHECK5: else7: +// CHECK5-NEXT: br label [[IFCONT8]] +// CHECK5: ifcont8: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], 
[32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK5: then8: +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK5-NEXT: br label [[IFCONT10:%.*]] +// CHECK5: else9: +// CHECK5-NEXT: br label [[IFCONT10]] +// CHECK5: ifcont10: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK5: then12: +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK5-NEXT: br label [[IFCONT14:%.*]] +// CHECK5: else13: +// CHECK5-NEXT: br label [[IFCONT14]] +// CHECK5: ifcont14: +// CHECK5-NEXT: ret void +// 
CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK5-NEXT: 
[[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK5: then6: +// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK5-NEXT: br label [[IFCONT8:%.*]] +// CHECK5: else7: +// CHECK5-NEXT: br label [[IFCONT8]] +// CHECK5: ifcont8: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK5-NEXT: store i8* 
[[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK5: then: +// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK5-NEXT: br label [[IFCONT:%.*]] +// CHECK5: else: +// CHECK5-NEXT: br label [[IFCONT]] +// CHECK5: ifcont: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK5: then4: +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK5-NEXT: br label [[IFCONT6:%.*]] +// CHECK5: else5: +// CHECK5-NEXT: br label [[IFCONT6]] +// CHECK5: ifcont6: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK5: then8: +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK5-NEXT: br label [[IFCONT10:%.*]] +// CHECK5: else9: +// CHECK5-NEXT: br label [[IFCONT10]] +// CHECK5: 
ifcont10: +// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK5: then12: +// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK5-NEXT: br label [[IFCONT14:%.*]] +// CHECK5: else13: +// CHECK5-NEXT: br label [[IFCONT14]] +// CHECK5: ifcont14: +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, 
align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 +// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: 
[[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 +// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 +// CHECK5-NEXT: ret void +// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 +// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK5-NEXT: entry: +// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* +// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 +// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 +// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK5-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker +// CHECK6-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* 
[[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 +// CHECK6-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK6-NEXT: [[E7:%.*]] = alloca double, align 8 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK6-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP6:%.*]] = 
call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK6-NEXT: store double [[TMP7]], double* [[E7]], align 8 +// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK6-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) +// CHECK6-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty* +// CHECK6-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0 +// CHECK6-NEXT: store double 0.000000e+00, double* [[E1]], align 8 +// CHECK6-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8 +// CHECK6-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00 +// CHECK6-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK6-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8* +// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 2048, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK6-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 +// CHECK6-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK6: .omp.reduction.then: +// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK6-NEXT: [[TMP13:%.*]] = load double, 
double* [[E1]], align 8 +// CHECK6-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]] +// CHECK6-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK6: .omp.reduction.done: +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]]) +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK6-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK6-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK6-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK6-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK6-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK6-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK6-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK6-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK6-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK6-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK6-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK6-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK6-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK6-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK6-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], 
[[TMP31]] +// CHECK6-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK6-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK6-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK6-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK6-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK6-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK6-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK6: then4: +// CHECK6-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK6-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK6-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK6-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK6-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK6-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK6-NEXT: br label [[IFCONT6:%.*]] +// CHECK6: else5: +// CHECK6-NEXT: br label [[IFCONT6]] +// CHECK6: ifcont6: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK6-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK6-NEXT: br label [[PRECOND:%.*]] +// CHECK6: precond: +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK6-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK6: body: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK6: then: +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] 
to i32* +// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK6-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK6-NEXT: br label [[IFCONT:%.*]] +// CHECK6: else: +// CHECK6-NEXT: br label [[IFCONT]] +// CHECK6: ifcont: +// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK6: then4: +// CHECK6-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK6-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK6-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK6-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK6-NEXT: br label [[IFCONT6:%.*]] +// CHECK6: else5: +// CHECK6-NEXT: br label [[IFCONT6]] +// CHECK6: ifcont6: +// CHECK6-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK6-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK6-NEXT: br label [[PRECOND]] +// CHECK6: exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK6-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] 
{ +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* +// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK6-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// 
CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* +// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 +// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker +// CHECK6-SAME: () #[[ATTR0]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK6: .await.work: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK6: .select.workers: +// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK6: .execute.parallel: +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK6: .terminate.parallel: +// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK6: .barrier.parallel: +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 +// CHECK6-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { +// CHECK6-NEXT: entry: +// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK6-NEXT: 
store i32 [[C]], i32* [[C_ADDR]], align 4 +// CHECK6-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 +// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* +// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* +// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK6: .worker: +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] +// CHECK6-NEXT: br label [[DOTEXIT:%.*]] +// CHECK6: .mastercheck: +// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK6: .master: +// CHECK6-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK6-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() +// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) +// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK6-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4 +// CHECK6-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK6-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4 +// CHECK6-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4 +// CHECK6-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK6-NEXT: store float [[TMP8]], float* [[D9]], align 4 +// CHECK6-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] +// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) +// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK6: .termination.notifier: +// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK6-NEXT: br label [[DOTEXIT]] +// CHECK6: .exit: +// CHECK6-NEXT: ret void +// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK6-SAME: (i32* noalias 
[[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
+// CHECK6-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
+// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2*
+// CHECK6-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1
+// CHECK6-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0
+// CHECK6-NEXT: store i8 0, i8* [[C1]], align 4
+// CHECK6-NEXT: store float 1.000000e+00, float* [[D2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4
+// CHECK6-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32
+// CHECK6-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
+// CHECK6-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
+// CHECK6-NEXT: store i8 [[CONV3]], i8* [[C1]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4
+// CHECK6-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01
+// CHECK6-NEXT: store float [[MUL]], float* [[D2]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8*
+// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK6-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
+// CHECK6-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
+// CHECK6-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK6: .omp.reduction.then:
+// CHECK6-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1
+// CHECK6-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32
+// CHECK6-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4
+// CHECK6-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32
+// CHECK6-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
+// CHECK6-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
+// CHECK6-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
+// CHECK6-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4
+// CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]]
+// CHECK6-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
+// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
+// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK6: .omp.reduction.done:
+// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]])
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK6-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
+// CHECK6-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK6-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
+// CHECK6-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
+// CHECK6-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
+// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK6-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
+// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
+// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
+// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
+// CHECK6-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
+// CHECK6-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
+// CHECK6-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
+// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK6-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4
+// CHECK6-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1
+// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
+// CHECK6-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK6-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4
+// CHECK6-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK6-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
+// CHECK6-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK6-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1
+// CHECK6-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
+// CHECK6-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
+// CHECK6-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK6-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
+// CHECK6-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
+// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
+// CHECK6-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK6: then:
+// CHECK6-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
+// CHECK6-NEXT: br label [[IFCONT:%.*]]
+// CHECK6: else:
+// CHECK6-NEXT: br label [[IFCONT]]
+// CHECK6: ifcont:
+// CHECK6-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK6-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
+// CHECK6-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK6: then6:
+// CHECK6-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4
+// CHECK6-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
+// CHECK6-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
+// CHECK6-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1
+// CHECK6-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
+// CHECK6-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
+// CHECK6-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
+// CHECK6-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
+// CHECK6-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
+// CHECK6-NEXT: store float [[TMP62]], float* [[TMP61]], align 4
+// CHECK6-NEXT: br label [[IFCONT8:%.*]]
+// CHECK6: else7:
+// CHECK6-NEXT: br label [[IFCONT8]]
+// CHECK6: ifcont8:
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK6: then:
+// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
+// CHECK6-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
+// CHECK6-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
+// CHECK6-NEXT: br label [[IFCONT:%.*]]
+// CHECK6: else:
+// CHECK6-NEXT: br label [[IFCONT]]
+// CHECK6: ifcont:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK6: then4:
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
+// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
+// CHECK6-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1
+// CHECK6-NEXT: br label [[IFCONT6:%.*]]
+// CHECK6: else5:
+// CHECK6-NEXT: br label [[IFCONT6]]
+// CHECK6: ifcont6:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK6: then8:
+// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
+// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
+// CHECK6-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
+// CHECK6-NEXT: br label [[IFCONT10:%.*]]
+// CHECK6: else9:
+// CHECK6-NEXT: br label [[IFCONT10]]
+// CHECK6: ifcont10:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
+// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK6: then12:
+// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4
+// CHECK6-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
+// CHECK6-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
+// CHECK6-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4
+// CHECK6-NEXT: br label [[IFCONT14:%.*]]
+// CHECK6: else13:
+// CHECK6-NEXT: br label [[IFCONT14]]
+// CHECK6: ifcont14:
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
+// CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
+// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
+// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
+// CHECK6-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
+// CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
+// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
+// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
+// CHECK6-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
+// CHECK6-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK6-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
+// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
+// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
+// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK6: .execute:
+// CHECK6-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK6-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK6-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
+// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK6: .omp.deinit:
+// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK6: .exit:
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK6-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK6-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK6-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK6-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK6-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
+// CHECK6-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
+// CHECK6-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK6: .omp.reduction.then:
+// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
+// CHECK6-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
+// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
+// CHECK6-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
+// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
+// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK6: cond.true:
+// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK6-NEXT: br label [[COND_END:%.*]]
+// CHECK6: cond.false:
+// CHECK6-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK6-NEXT: br label [[COND_END]]
+// CHECK6: cond.end:
+// CHECK6-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
+// CHECK6-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
+// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
+// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK6: .omp.reduction.done:
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
+// CHECK6-NEXT: store i32 [[OR]], i32* [[A1]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
+// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
+// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK6: cond.true:
+// CHECK6-NEXT: br label [[COND_END:%.*]]
+// CHECK6: cond.false:
+// CHECK6-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
+// CHECK6-NEXT: br label [[COND_END]]
+// CHECK6: cond.end:
+// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
+// CHECK6-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
+// CHECK6-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
+// CHECK6-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
+// CHECK6-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK6: .omp.reduction.then:
+// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK6-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
+// CHECK6-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
+// CHECK6-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK6-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
+// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK6-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
+// CHECK6-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
+// CHECK6-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
+// CHECK6: cond.true9:
+// CHECK6-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK6-NEXT: br label [[COND_END11:%.*]]
+// CHECK6: cond.false10:
+// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK6-NEXT: br label [[COND_END11]]
+// CHECK6: cond.end11:
+// CHECK6-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
+// CHECK6-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
+// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
+// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK6: .omp.reduction.done:
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
+// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
+// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
+// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
+// CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
+// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
+// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
+// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
+// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
+// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
+// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
+// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
+// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK6: then:
+// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
+// CHECK6-NEXT: br label [[IFCONT:%.*]]
+// CHECK6: else:
+// CHECK6-NEXT: br label [[IFCONT]]
+// CHECK6: ifcont:
+// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
+// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK6: then6:
+// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
+// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
+// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
+// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
+// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
+// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
+// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
+// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK6-NEXT: br label [[IFCONT8:%.*]]
+// CHECK6: else7:
+// CHECK6-NEXT: br label [[IFCONT8]]
+// CHECK6: ifcont8:
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
+// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK6: then:
+// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK6-NEXT: br label [[IFCONT:%.*]]
+// CHECK6: else:
+// CHECK6-NEXT: br label [[IFCONT]]
+// CHECK6: ifcont:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK6: then4:
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK6-NEXT: br label [[IFCONT6:%.*]]
+// CHECK6: else5:
+// CHECK6-NEXT: br label [[IFCONT6]]
+// CHECK6: ifcont6:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK6: then8:
+// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK6-NEXT: br label [[IFCONT10:%.*]]
+// CHECK6: else9:
+// CHECK6-NEXT: br label [[IFCONT10]]
+// CHECK6: ifcont10:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK6: then12:
+// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
+// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK6-NEXT: br label [[IFCONT14:%.*]]
+// CHECK6: else13:
+// CHECK6-NEXT: br label [[IFCONT14]]
+// CHECK6: ifcont14:
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
+// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
+// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
+// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
+// CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
+// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
+// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
+// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
+// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
+// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
+// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
+// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
+// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
+// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK6: then:
+// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
+// CHECK6-NEXT: br label [[IFCONT:%.*]]
+// CHECK6: else:
+// CHECK6-NEXT: br label [[IFCONT]]
+// CHECK6: ifcont:
+// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
+// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK6: then6:
+// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
+// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
+// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
+// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
+// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
+// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
+// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
+// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK6-NEXT: br label [[IFCONT8:%.*]]
+// CHECK6: else7:
+// CHECK6-NEXT: br label [[IFCONT8]]
+// CHECK6: ifcont8:
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK6: then:
+// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
+// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK6-NEXT: br label [[IFCONT:%.*]]
+// CHECK6: else:
+// CHECK6-NEXT: br label [[IFCONT]]
+// CHECK6: ifcont:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK6: then4:
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK6-NEXT: br label [[IFCONT6:%.*]]
+// CHECK6: else5:
+// CHECK6-NEXT: br label [[IFCONT6]]
+// CHECK6: ifcont6:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK6: then8:
+// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK6-NEXT: br label [[IFCONT10:%.*]]
+// CHECK6: else9:
+// CHECK6-NEXT: br label [[IFCONT10]]
+// CHECK6: ifcont10:
+// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK6: then12:
+// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
+// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK6-NEXT: br label [[IFCONT14:%.*]]
+// CHECK6: else13:
+// CHECK6-NEXT: br label [[IFCONT14]]
+// CHECK6: ifcont14:
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
+// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
+// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
+// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
+// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
+// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
+// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
+// CHECK6-NEXT: ret void
+// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
+// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK6-NEXT: entry:
+// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
+// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
+// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
+// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
+// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK6-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_worker
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK1: .await.work:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
+// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK1: .select.workers:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK1: .execute.parallel:
+// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK1: .terminate.parallel:
+// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK1: .barrier.parallel:
+// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
//
//
-// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18
-// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK4-NEXT: entry:
-// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
-// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK4-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK4-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK4-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK4-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
-// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK4-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK4-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK4: then:
-// CHECK4-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK4-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
-// CHECK4-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
-// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK4-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
-// CHECK4-NEXT: br label [[IFCONT:%.*]]
-// CHECK4: else:
-// CHECK4-NEXT: br label [[IFCONT]]
-// CHECK4: ifcont:
-// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK4-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK4-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
-// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK4: then4:
-// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
-// CHECK4-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
-// CHECK4-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
-// CHECK4-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
-// CHECK4-NEXT: br label [[IFCONT6:%.*]]
-// CHECK4: else5:
-// CHECK4-NEXT: br label [[IFCONT6]]
-// CHECK4: ifcont6:
-// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK4-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK4-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
-// CHECK4: then8:
-// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
-// CHECK4-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
-// CHECK4-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
-// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK4-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
-// CHECK4-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
-// CHECK4-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
-// CHECK4-NEXT: br label [[IFCONT10:%.*]]
-// CHECK4: else9:
-// CHECK4-NEXT: br label [[IFCONT10]]
-// CHECK4: ifcont10:
-// CHECK4-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK4-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK4-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
-// CHECK4-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
-// CHECK4: then12:
-// CHECK4-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK4-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
-// CHECK4-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
-// CHECK4-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
-// CHECK4-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
-// CHECK4-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
-// CHECK4-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
-// CHECK4-NEXT: br label
[[IFCONT14:%.*]] -// CHECK4: else13: -// CHECK4-NEXT: br label [[IFCONT14]] -// CHECK4: ifcont14: -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20 +// CHECK1-SAME: (i64 [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[E]], i64* [[E_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double* +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_worker() #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load double, double* [[CONV]], align 8 +// CHECK1-NEXT: [[E7:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E7]] to double* +// CHECK1-NEXT: store double [[TMP5]], double* [[E_ON_STACK]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E_ON_STACK]]) #[[ATTR3]] +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[E7]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 
8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store double* [[E]], double** [[E_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8 +// CHECK1-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i64 8) +// CHECK1-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double* +// CHECK1-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00 +// CHECK1-NEXT: store double [[ADD]], double* [[E_ON_STACK]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 1024, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK1: .omp.reduction.then: +// CHECK1-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]] +// CHECK1-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) +// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK1: .omp.reduction.done: +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[E1]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// 
CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK1-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK1-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1 +// CHECK1-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK1-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK1-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK1-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK1-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK1-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK1-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK1-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK1-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK1-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK1-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK1-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK1-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK1-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK1: then4: +// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK1-NEXT: 
[[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK1-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK1-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK1-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK1-NEXT: br label [[IFCONT6:%.*]] +// CHECK1: else5: +// CHECK1-NEXT: br label [[IFCONT6]] +// CHECK1: ifcont6: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK1-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK1-NEXT: br label [[PRECOND:%.*]] +// CHECK1: precond: +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK1-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK1: body: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK1-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK1: then4: +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 
[[NVPTX_TID]] +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK1-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK1-NEXT: br label [[IFCONT6:%.*]] +// CHECK1: else5: +// CHECK1-NEXT: br label [[IFCONT6]] +// CHECK1: ifcont6: +// CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK1-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK1-NEXT: br label [[PRECOND]] +// CHECK1: exit: +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 -// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load 
i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK1-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], 
align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK4-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 -// CHECK4-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK4-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK4-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 -// CHECK4-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 -// CHECK4-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// 
CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK1-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK1-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22 -// CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK4-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK4-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK4-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4* -// CHECK4-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK4-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0 -// CHECK4-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* -// CHECK4-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK4-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1 -// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]] -// CHECK4-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* -// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 -// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK4-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK4-NEXT: call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK4-NEXT: ret void +// 
CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_worker +// CHECK1-SAME: () #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK1: .await.work: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8 +// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK1: .select.workers: +// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK1: .execute.parallel: +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK1: .terminate.parallel: +// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK1: .barrier.parallel: +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK1: .exit: +// CHECK1-NEXT: ret 
void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker -// CHECK5-SAME: () #[[ATTR0:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK5: .await.work: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK5: .select.workers: -// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK5: .execute.parallel: -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] -// CHECK5: .terminate.parallel: -// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK5: .barrier.parallel: -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26 +// CHECK1-SAME: (i64 [[C:%.*]], i64 [[D:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i64 [[C]], i64* [[C_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[D]], i64* [[D_ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8* +// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float* +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK1: .worker: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_worker() #[[ATTR3]] +// CHECK1-NEXT: br label [[DOTEXIT:%.*]] +// CHECK1: .mastercheck: +// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// 
CHECK1-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK1: .master: +// CHECK1-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK1-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK1-NEXT: [[TMP5:%.*]] = load i8, i8* [[CONV]], align 8 +// CHECK1-NEXT: [[C8:%.*]] = call i8* @__kmpc_alloc_shared(i64 1) +// CHECK1-NEXT: store i8 [[TMP5]], i8* [[C8]], align 1 +// CHECK1-NEXT: [[TMP6:%.*]] = load float, float* [[CONV1]], align 8 +// CHECK1-NEXT: [[D9:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D9]] to float* +// CHECK1-NEXT: store float [[TMP6]], float* [[D_ON_STACK]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: store i32 [[TMP7]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D_ON_STACK]]) #[[ATTR3]] +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[D9]]) +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[C8]]) +// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK1: .termination.notifier: +// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK1-NEXT: br label [[DOTEXIT]] +// CHECK1: .exit: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23 -// CHECK5-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 -// CHECK5-NEXT: [[E7:%.*]] = alloca double, align 8 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK5-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK5-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK5: .worker: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]] -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .mastercheck: -// CHECK5-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: 
[[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 -// CHECK5-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 -// CHECK5-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 -// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] -// CHECK5-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] -// CHECK5-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK5: .master: -// CHECK5-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] -// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 -// CHECK5-NEXT: store double [[TMP7]], double* [[E7]], align 8 -// CHECK5-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK5: .termination.notifier: -// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK5-NEXT: br label [[DOTEXIT]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8 +// CHECK1-NEXT: store float* [[D]], float** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8 +// CHECK1-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i64 1) +// CHECK1-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) +// CHECK1-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float* +// CHECK1-NEXT: store i8 0, i8* [[C1]], align 1 +// CHECK1-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32 +// CHECK1-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK1-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1 +// CHECK1-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01 +// CHECK1-NEXT: 
store float [[MUL]], float* [[D_ON_STACK]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: store i8* [[C1]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) +// CHECK1-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1 +// CHECK1-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK1: .omp.reduction.then: +// CHECK1-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK1-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32 +// CHECK1-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK1-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK1-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK1-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4 +// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]] +// CHECK1-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK1: .omp.reduction.done: +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[D2]]) +// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__ -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) -// CHECK5-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to 
%struct._globalized_locals_ty*
-// CHECK5-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
-// CHECK5-NEXT: store double 0.000000e+00, double* [[E1]], align 8
-// CHECK5-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8
-// CHECK5-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00
-// CHECK5-NEXT: store double [[ADD]], double* [[E1]], align 8
-// CHECK5-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
-// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8*
-// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
-// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK5-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 1024, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
-// CHECK5-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
-// CHECK5-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
-// CHECK5: .omp.reduction.then:
-// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8
-// CHECK5-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8
-// CHECK5-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]]
-// CHECK5-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
-// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
-// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
-// CHECK5: .omp.reduction.done:
-// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
-// CHECK5-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
+// CHECK1-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
+// CHECK1-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
+// CHECK1-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
+// CHECK1-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
+// CHECK1-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i64 1
+// CHECK1-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
+// CHECK1-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
+// CHECK1-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK1-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i64 1
+// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1
+// CHECK1-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK1-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
+// CHECK1-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK1-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1
+// CHECK1-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
+// CHECK1-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
+// CHECK1-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK1-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
+// CHECK1-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
+// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
+// CHECK1-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK1: then:
+// CHECK1-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[IFCONT:%.*]]
+// CHECK1: else:
+// CHECK1-NEXT: br label [[IFCONT]]
+// CHECK1: ifcont:
+// CHECK1-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
+// CHECK1-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK1: then6:
+// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8
+// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8
+// CHECK1-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
+// CHECK1-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1
+// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8
+// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8
+// CHECK1-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
+// CHECK1-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
+// CHECK1-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
+// CHECK1-NEXT: store float [[TMP62]], float* [[TMP61]], align 4
+// CHECK1-NEXT: br label [[IFCONT8:%.*]]
+// CHECK1: else7:
+// CHECK1-NEXT: br label [[IFCONT8]]
+// CHECK1: ifcont8:
+// CHECK1-NEXT: ret void
//
//
-// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
-// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
-// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
-// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
-// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
-// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
-// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
-// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
-// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
-// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
-// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
-// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
-// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
-// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
-// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
-// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
-// CHECK5-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
-// CHECK5-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
-// CHECK5-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
-// CHECK5-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
-// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK5-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
-// CHECK5-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
-// CHECK5-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
-// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
-// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
-// CHECK5-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK5-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4
-// CHECK5-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
-// CHECK5-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK5-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
-// CHECK5-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
-// CHECK5-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
-// CHECK5-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
-// CHECK5-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
-// CHECK5-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
-// CHECK5-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
-// CHECK5-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
-// CHECK5-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
-// CHECK5-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
-// CHECK5-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK5: then:
-// CHECK5-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
-// CHECK5-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
-// CHECK5-NEXT: br label [[IFCONT:%.*]]
-// CHECK5: else:
-// CHECK5-NEXT: br label [[IFCONT]]
-// CHECK5: ifcont:
-// CHECK5-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK5-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
-// CHECK5-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
-// CHECK5-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK5: then4:
-// CHECK5-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
-// CHECK5-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
-// CHECK5-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
-// CHECK5-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
-// CHECK5-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
-// CHECK5-NEXT: store double [[TMP46]], double* [[TMP45]], align 8
-// CHECK5-NEXT: br label [[IFCONT6:%.*]]
-// CHECK5: else5:
-// CHECK5-NEXT: br label [[IFCONT6]]
-// CHECK5: ifcont6:
-// CHECK5-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK1: then:
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
+// CHECK1-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
+// CHECK1-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
+// CHECK1-NEXT: br label [[IFCONT:%.*]]
+// CHECK1: else:
+// CHECK1-NEXT: br label [[IFCONT]]
+// CHECK1: ifcont:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK1: then4:
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
+// CHECK1-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1
+// CHECK1-NEXT: br label [[IFCONT6:%.*]]
+// CHECK1: else5:
+// CHECK1-NEXT: br label [[IFCONT6]]
+// CHECK1: ifcont6:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK1: then8:
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
+// CHECK1-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
+// CHECK1-NEXT: br label [[IFCONT10:%.*]]
+// CHECK1: else9:
+// CHECK1-NEXT: br label [[IFCONT10]]
+// CHECK1: ifcont10:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
+// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK1: then12:
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
+// CHECK1-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
+// CHECK1-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4
+// CHECK1-NEXT: br label [[IFCONT14:%.*]]
+// CHECK1: else13:
+// CHECK1-NEXT: br label [[IFCONT14]]
+// CHECK1: ifcont14:
+// CHECK1-NEXT: ret void
//
//
-// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
-// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
-// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
-// CHECK5-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4
-// CHECK5-NEXT: br label [[PRECOND:%.*]]
-// CHECK5: precond:
-// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
-// CHECK5-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
-// CHECK5-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
-// CHECK5: body:
-// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
-// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK5: then:
-// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
-// CHECK5-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
-// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
-// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
-// CHECK5-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
-// CHECK5-NEXT: br label [[IFCONT:%.*]]
-// CHECK5: else:
-// CHECK5-NEXT: br label [[IFCONT]]
-// CHECK5: ifcont:
-// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
-// CHECK5-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
-// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK5: then4:
-// CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
-// CHECK5-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
-// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
-// CHECK5-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
-// CHECK5-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4
-// CHECK5-NEXT: br label [[IFCONT6:%.*]]
-// CHECK5: else5:
-// CHECK5-NEXT: br label [[IFCONT6]]
-// CHECK5: ifcont6:
-// CHECK5-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
-// CHECK5-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
-// CHECK5-NEXT: br label [[PRECOND]]
-// CHECK5: exit:
-// CHECK5-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
+// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
+// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
+// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
+// CHECK1-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
+// CHECK1-NEXT: ret void
+//
//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK1-NEXT: ret void
//
-// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
-// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
-// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
-// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
-// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
-// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
-// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
-// CHECK5-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
-// CHECK5-NEXT: ret void
//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
+// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
+// CHECK1-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
+// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
+// CHECK1-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
+// CHECK1-NEXT: ret void
//
-// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
-// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
-// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
-// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
-// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
-// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
-// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
-// CHECK5-NEXT: ret void
//
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK1-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
+// CHECK1-NEXT: ret void
//
-// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
-// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
-// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
-// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
-// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
-// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
-// CHECK5-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
-// CHECK5-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
-// CHECK5-NEXT: ret void
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
+// CHECK1-SAME: (i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16*
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK1: .execute:
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK1-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK1: .omp.deinit:
+// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK1: .exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
+// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i64 2)
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK1-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
+// CHECK1-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
+// CHECK1-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK1: .omp.reduction.then:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
+// CHECK1-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
+// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK1: .omp.reduction.done:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8
+// CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK1-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK1-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
+// CHECK1-NEXT: store i32 [[OR]], i32* [[A1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
+// CHECK1-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
+// CHECK1-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
+// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK1: .omp.reduction.then:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK1-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
+// CHECK1-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK1-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
+// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
+// CHECK1-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
+// CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
+// CHECK1: cond.true9:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK1-NEXT: br label [[COND_END11:%.*]]
+// CHECK1: cond.false10:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK1-NEXT: br label [[COND_END11]]
+// CHECK1: cond.end11:
+// CHECK1-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
+// CHECK1-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
+// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK1: .omp.reduction.done:
+// CHECK1-NEXT: ret void
//
//
-// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
-// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
-// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
-// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK5-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
-// CHECK5-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
-// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
-// CHECK5-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
-// CHECK5-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK1-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
+// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
+// CHECK1-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK1-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK1-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
+// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+// CHECK1-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK1-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
+// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
+// CHECK1-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
+// CHECK1-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
+// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
+// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
+// CHECK1-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK1: then:
+// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[IFCONT:%.*]]
+// CHECK1: else:
+// CHECK1-NEXT: br label [[IFCONT]]
+// CHECK1: ifcont:
+// CHECK1-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
+// CHECK1-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK1: then6:
+// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8
+// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
+// CHECK1-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
+// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
+// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK1-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
+// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8
+// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
+// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK1-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK1-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK1-NEXT: br label [[IFCONT8:%.*]]
+// CHECK1: else7:
+// CHECK1-NEXT: br label [[IFCONT8]]
+// CHECK1: ifcont8:
+// CHECK1-NEXT: ret void
//
//
-// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
-// CHECK5-SAME: () #[[ATTR0]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK5-NEXT: store i8* null, i8** [[WORK_FN]], align 4
-// CHECK5-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK5-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK5: .await.work:
-// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK5-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK5-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK5-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK5-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK5-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK5-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK5: .select.workers:
-// CHECK5-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK5-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK5-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK5: .execute.parallel:
-// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK5-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
-// CHECK5-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK5: .terminate.parallel:
-// CHECK5-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK5-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK5: .barrier.parallel:
-// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK5-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK5: .exit:
-// CHECK5-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
+// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK1: then:
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK1-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK1-NEXT: br label [[IFCONT:%.*]]
+// CHECK1: else:
+// CHECK1-NEXT: br label [[IFCONT]]
+// CHECK1: ifcont:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK1: then4:
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK1-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK1-NEXT: br label [[IFCONT6:%.*]]
+// CHECK1: else5:
+// CHECK1-NEXT: br label [[IFCONT6]]
+// CHECK1: ifcont6:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK1: then8:
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK1-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK1-NEXT: br label [[IFCONT10:%.*]]
+// CHECK1: else9:
+// CHECK1-NEXT: br label [[IFCONT10]]
+// CHECK1: ifcont10:
+// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK1: then12:
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK1-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK1-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK1-NEXT: br label [[IFCONT14:%.*]]
+// CHECK1: else13:
+// CHECK1-NEXT: br label [[IFCONT14]]
+// CHECK1: ifcont14:
+// CHECK1-NEXT: ret void
//
//
-// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
-// CHECK5-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK5-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
-// CHECK5-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
-// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
-// CHECK5-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
-// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK5-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
-// CHECK5-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
-// CHECK5-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
-// CHECK5: .worker:
-// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
-// CHECK5-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK5: .mastercheck:
-// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK5-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK5-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK5-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
-// CHECK5-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
-// CHECK5-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
-// CHECK5-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
-// CHECK5-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
-// CHECK5-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
-// CHECK5: .master:
-// CHECK5-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK5-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK5-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
-// CHECK5-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
-// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack()
-// CHECK5-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
-// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
-// CHECK5-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4
-// CHECK5-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
-// CHECK5-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4
-// CHECK5-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4
-// CHECK5-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
-// CHECK5-NEXT: store float [[TMP8]], float* [[D9]], align 4
-// CHECK5-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK5-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK5-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
-// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
-// CHECK5-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
-// CHECK5: .termination.notifier:
-// CHECK5-NEXT: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK5-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK5-NEXT: br label [[DOTEXIT]]
-// CHECK5: .exit:
-// CHECK5-NEXT: ret void
+// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
+// CHECK1-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK1-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK1-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK1-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK1-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
+// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK1-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK1-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
+// CHECK1-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK1-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK1-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK1-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK1-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
+// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK1-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+// CHECK1-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK1-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
+// CHECK1-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
+// CHECK1-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
+// CHECK1-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
+// CHECK1-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
+// CHECK1-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
+// CHECK1-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK1: then:
+// CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
+// CHECK1-NEXT: br label [[IFCONT:%.*]]
+// CHECK1: else:
+// CHECK1-NEXT: br label [[IFCONT]]
+// CHECK1: ifcont:
+// CHECK1-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK1-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
+// CHECK1-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK1: then6:
+// CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8
+// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
+// CHECK1-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
+// CHECK1-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
+// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK1-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
+// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8
+// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
+// CHECK1-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK1-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK1-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK1-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK1-NEXT: br label [[IFCONT8:%.*]]
+// CHECK1: else7:
+// CHECK1-NEXT: br label [[IFCONT8]]
+// CHECK1: ifcont8:
+// CHECK1-NEXT: ret void
//
//
-// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1
-// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK5-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
-// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
-// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK5-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
-// CHECK5-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
-// CHECK5-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
-// CHECK5-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
-// CHECK5-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
-// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2*
-// CHECK5-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1
-// CHECK5-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0
-// CHECK5-NEXT: store i8 0, i8* [[C1]], align 4
-// CHECK5-NEXT: store float 1.000000e+00, float* [[D2]], align 4
-// CHECK5-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4
-// CHECK5-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32
-// CHECK5-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
-// CHECK5-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
-// CHECK5-NEXT: store i8 [[CONV3]], i8*
[[C1]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4 -// CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01 -// CHECK5-NEXT: store float [[MUL]], float* [[D2]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) -// CHECK5-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 -// CHECK5-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK5: .omp.reduction.then: -// CHECK5-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1 -// CHECK5-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32 -// CHECK5-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK5-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32 -// CHECK5-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] -// CHECK5-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 -// CHECK5-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 -// CHECK5-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 -// CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] -// CHECK5-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 -// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK5: .omp.reduction.done: -// CHECK5-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK1-NEXT: 
[[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK1: then: +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK1-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK1-NEXT: br label [[IFCONT:%.*]] +// CHECK1: else: +// CHECK1-NEXT: br label [[IFCONT]] +// CHECK1: ifcont: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK1: then4: +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK1-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK1-NEXT: br label [[IFCONT6:%.*]] +// CHECK1: else5: +// CHECK1-NEXT: br label [[IFCONT6]] +// CHECK1: ifcont6: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK1-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK1-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK1: then8: +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK1-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK1-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK1-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK1-NEXT: br label [[IFCONT10:%.*]] +// CHECK1: else9: +// CHECK1-NEXT: br label [[IFCONT10]] +// CHECK1: ifcont10: +// CHECK1-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK1: then12: +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* 
@__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK1-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK1-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK1-NEXT: br label [[IFCONT14:%.*]] +// CHECK1: else13: +// CHECK1-NEXT: br label [[IFCONT14]] +// CHECK1: ifcont14: +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK5-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 -// CHECK5-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK5-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) -// CHECK5-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -// CHECK5-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 -// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK5-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 -// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* -// CHECK5-NEXT: [[TMP24:%.*]] = getelementptr float, float* 
[[TMP23]], i32 1 -// CHECK5-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* -// CHECK5-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* -// CHECK5-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* -// CHECK5-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK5-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 -// CHECK5-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 -// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 -// CHECK5-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK5-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 -// CHECK5-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] -// CHECK5-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK5-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 -// CHECK5-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 -// CHECK5-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] -// CHECK5-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK5-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] -// CHECK5-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] -// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] -// CHECK5-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] -// CHECK5-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK5: then6: -// CHECK5-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 -// CHECK5-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 -// CHECK5-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 -// CHECK5-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 -// CHECK5-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 -// CHECK5-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 -// CHECK5-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* -// CHECK5-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* -// CHECK5-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 -// CHECK5-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 -// CHECK5-NEXT: br label [[IFCONT8:%.*]] 
-// CHECK5: else7: -// CHECK5-NEXT: br label [[IFCONT8]] -// CHECK5: ifcont8: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: 
[[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* -// CHECK5-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 -// CHECK5-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK5: then4: -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 -// CHECK5-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 -// CHECK5-NEXT: br label [[IFCONT6:%.*]] -// CHECK5: else5: -// CHECK5-NEXT: br label [[IFCONT6]] -// CHECK5: ifcont6: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK5: then8: -// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 -// CHECK5-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 -// CHECK5-NEXT: br label [[IFCONT10:%.*]] -// CHECK5: else9: -// CHECK5-NEXT: br label [[IFCONT10]] -// CHECK5: ifcont10: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK5: then12: -// CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 -// CHECK5-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* -// CHECK5-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 -// CHECK5-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 -// CHECK5-NEXT: br 
label [[IFCONT14:%.*]] -// CHECK5: else13: -// CHECK5-NEXT: br label [[IFCONT14]] -// CHECK5: ifcont14: -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK1-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, 
i32 0 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 +// CHECK1-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 +// CHECK1-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1 -// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128 -// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4 -// CHECK5-NEXT: store float [[TMP16]], float* [[TMP15]], align 128 -// CHECK5-NEXT: ret void +// CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 +// CHECK1-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 8 +// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: store i32 [[TMP1]], i32* 
[[DOTADDR1]], align 4 +// CHECK1-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK1-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK1-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 
4 -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_worker +// CHECK2-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3* -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128 -// CHECK5-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1 -// CHECK5-NEXT: [[TMP12:%.*]] = 
getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float* -// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]] -// CHECK5-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128 -// CHECK5-NEXT: store float [[TMP16]], float* [[TMP14]], align 4 -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20 +// CHECK2-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-NEXT: [[E7:%.*]] = alloca double, align 8 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_worker() #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK2-NEXT: store double [[TMP7]], double* [[E7]], align 8 +// CHECK2-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: 
.termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3* -// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]] -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]] -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK2-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i32 8) +// CHECK2-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double* +// CHECK2-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8 +// CHECK2-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK2-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00 +// CHECK2-NEXT: store double [[ADD]], double* 
[[E_ON_STACK]], align 8 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 1024, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2: .omp.reduction.then: +// CHECK2-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]] +// CHECK2-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) +// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK2: .omp.reduction.done: +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[E1]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK2-NEXT: 
[[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK2-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK2-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK2-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK2-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK2-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK2-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK2-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK2-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK2-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK2-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK2-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK2-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK2-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK2-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK2-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK2-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2: then4: +// CHECK2-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK2-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK2-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK2-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK2-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK2-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK2-NEXT: br label [[IFCONT6:%.*]] +// CHECK2: else5: +// CHECK2-NEXT: br label [[IFCONT6]] +// CHECK2: ifcont6: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36 -// CHECK5-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, 
align 4 -// CHECK5-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK5-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 -// CHECK5-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1) -// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() -// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]] -// CHECK5: .execute: -// CHECK5-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) -// CHECK5-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[DOTOMP_DEINIT:%.*]] -// CHECK5: .omp.deinit: -// CHECK5-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) -// CHECK5-NEXT: br label [[DOTEXIT:%.*]] -// CHECK5: .exit: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK2-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK2-NEXT: br label [[PRECOND:%.*]] +// CHECK2: precond: +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK2-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK2: body: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK2-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: call void 
@__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2: then4: +// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK2-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK2-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK2-NEXT: br label [[IFCONT6:%.*]] +// CHECK2: else5: +// CHECK2-NEXT: br label [[IFCONT6]] +// CHECK2: ifcont6: +// CHECK2-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK2-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK2-NEXT: br label [[PRECOND]] +// CHECK2: exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK5-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK5-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK5-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) -// CHECK5-NEXT: [[TMP9:%.*]] = 
getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK5-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20) -// CHECK5-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 -// CHECK5-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK5: .omp.reduction.then: -// CHECK5-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] -// CHECK5-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 -// CHECK5-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] -// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] -// CHECK5-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 -// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK5: .omp.reduction.done: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], 
align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 +// CHECK2-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10 -// CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 -// CHECK5-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 -// CHECK5-NEXT: [[A1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[B2:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 -// CHECK5-NEXT: store i32 0, i32* [[A1]], align 4 -// CHECK5-NEXT: store i16 -32768, i16* [[B2]], align 2 -// CHECK5-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK5-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 -// CHECK5-NEXT: store i32 [[OR]], i32* [[A1]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 -// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] -// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// CHECK5: cond.true: -// CHECK5-NEXT: br label [[COND_END:%.*]] -// CHECK5: cond.false: -// CHECK5-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 -// CHECK5-NEXT: br label [[COND_END]] -// CHECK5: cond.end: -// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] -// CHECK5-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 -// CHECK5-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 -// CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* -// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* -// CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// 
CHECK5-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) -// CHECK5-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 -// CHECK5-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK5: .omp.reduction.then: -// CHECK5-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 -// CHECK5-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] -// CHECK5-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 -// CHECK5-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK5-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 -// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 -// CHECK5-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] -// CHECK5-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] -// CHECK5: cond.true9: -// CHECK5-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK5-NEXT: br label [[COND_END11:%.*]] -// CHECK5: cond.false10: -// CHECK5-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 -// CHECK5-NEXT: br label [[COND_END11]] -// CHECK5: cond.end11: -// CHECK5-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] -// CHECK5-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 -// CHECK5-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) -// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK5: .omp.reduction.done: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 
signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* 
[[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK5-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK5: then6: -// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK5-NEXT: br label [[IFCONT8:%.*]] -// CHECK5: else7: -// CHECK5-NEXT: br label [[IFCONT8]] -// CHECK5: ifcont8: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** 
[[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 +// CHECK2-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK5: then4: -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], 
[2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK5-NEXT: br label [[IFCONT6:%.*]] -// CHECK5: else5: -// CHECK5-NEXT: br label [[IFCONT6]] -// CHECK5: ifcont6: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK5: then8: -// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK5-NEXT: br label [[IFCONT10:%.*]] -// CHECK5: else9: -// CHECK5-NEXT: br label [[IFCONT10]] -// CHECK5: ifcont10: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK5: then12: -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK5-NEXT: br label [[IFCONT14:%.*]] -// CHECK5: else13: -// CHECK5-NEXT: br label [[IFCONT14]] -// CHECK5: ifcont14: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], 
i32 0, i32 0 +// CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_worker +// CHECK2-SAME: () #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK2: .await.work: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK2: .select.workers: +// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK2: .execute.parallel: +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK2: .terminate.parallel: +// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK2: .barrier.parallel: +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26 +// CHECK2-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 +// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* +// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call 
i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// CHECK2: .worker: +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_worker() #[[ATTR3]] +// CHECK2-NEXT: br label [[DOTEXIT:%.*]] +// CHECK2: .mastercheck: +// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 +// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 +// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 +// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] +// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] +// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK2: .master: +// CHECK2-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] +// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) +// CHECK2-NEXT: [[TMP5:%.*]] = load i8, i8* [[CONV]], align 4 +// CHECK2-NEXT: [[C8:%.*]] = call i8* @__kmpc_alloc_shared(i32 1) +// CHECK2-NEXT: store i8 [[TMP5]], i8* [[C8]], align 1 +// CHECK2-NEXT: [[TMP6:%.*]] = load float, float* [[CONV1]], align 4 +// CHECK2-NEXT: [[D9:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D9]] to float* +// CHECK2-NEXT: store float [[TMP6]], float* [[D_ON_STACK]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D_ON_STACK]]) #[[ATTR3]] +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[D9]]) +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[C8]]) +// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK2: .termination.notifier: +// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK2-NEXT: br label [[DOTEXIT]] +// CHECK2: .exit: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], 
align 2 -// CHECK5-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK5-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK5-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK5-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK5-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK5-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK5-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK5-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK5-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK5-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK5-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK5-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK5-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK5-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK5-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK5-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK5-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK5-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK5-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK5-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK5-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK5-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK5-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK5-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK5-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK5-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK5-NEXT: [[TMP43:%.*]] = 
icmp sgt i16 [[TMP7]], 0 -// CHECK5-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK5-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK5-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK5-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK5-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK5-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK5-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK5-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK5: then6: -// CHECK5-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK5-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK5-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK5-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK5-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK5-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK5-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK5-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK5-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK5-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK5-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK5-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK5-NEXT: br label [[IFCONT8:%.*]] -// CHECK5: else7: -// CHECK5-NEXT: br label [[IFCONT8]] -// CHECK5: ifcont8: -// CHECK5-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK2-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK2-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK2-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 1) +// CHECK2-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i32 4) +// CHECK2-NEXT: 
[[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float* +// CHECK2-NEXT: store i8 0, i8* [[C1]], align 1 +// CHECK2-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK2-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32 +// CHECK2-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK2-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK2-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1 +// CHECK2-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01 +// CHECK2-NEXT: store float [[MUL]], float* [[D_ON_STACK]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) +// CHECK2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1 +// CHECK2-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK2: .omp.reduction.then: +// CHECK2-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK2-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32 +// CHECK2-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK2-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK2-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4 +// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]] +// CHECK2-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK2: .omp.reduction.done: +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[D2]]) +// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[C1]]) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = 
alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK2-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK2-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK2-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK2-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK2-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK2-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK2-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK2-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK2-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK2-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP36:%.*]] = icmp 
ult i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK2-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK2-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK2-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK2-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK2-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK2-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK2-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK2-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK2-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK2-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK2: then6: +// CHECK2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK2-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK2-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK2-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK2-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK2-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK2-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK2-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK2-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK2-NEXT: br label [[IFCONT8:%.*]] +// CHECK2: else7: +// CHECK2-NEXT: br label [[IFCONT8]] +// CHECK2: ifcont8: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: 
[[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK2: then: +// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK2-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK2-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK2-NEXT: br label [[IFCONT:%.*]] +// CHECK2: else: +// CHECK2-NEXT: br label [[IFCONT]] +// CHECK2: ifcont: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK2: then4: +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK2-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK2-NEXT: br label [[IFCONT6:%.*]] +// CHECK2: else5: +// CHECK2-NEXT: br label [[IFCONT6]] +// CHECK2: ifcont6: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK2: then8: +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK2-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK2-NEXT: br label [[IFCONT10:%.*]] +// CHECK2: else9: +// CHECK2-NEXT: br label [[IFCONT10]] +// CHECK2: ifcont10: +// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK2: then12: +// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* 
[[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK2-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK2-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK2-NEXT: br label [[IFCONT14:%.*]] +// CHECK2: else13: +// CHECK2-NEXT: br label [[IFCONT14]] +// CHECK2: ifcont14: +// CHECK2-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 -// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK5-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK5-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK5-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK5-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK5: then: -// CHECK5-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK5-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK5-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK5-NEXT: br label [[IFCONT:%.*]] -// CHECK5: else: -// CHECK5-NEXT: br label [[IFCONT]] -// CHECK5: ifcont: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK5-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK5: then4: -// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK5-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK5-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK5-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK5-NEXT: br label [[IFCONT6:%.*]] -// CHECK5: else5: -// CHECK5-NEXT: br label [[IFCONT6]] -// CHECK5: ifcont6: -// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK5-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 
-// CHECK5-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
-// CHECK5: then8:
-// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
-// CHECK5-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
-// CHECK5-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
-// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK5-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
-// CHECK5-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
-// CHECK5-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
-// CHECK5-NEXT: br label [[IFCONT10:%.*]]
-// CHECK5: else9:
-// CHECK5-NEXT: br label [[IFCONT10]]
-// CHECK5: ifcont10:
-// CHECK5-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
-// CHECK5-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
-// CHECK5-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
-// CHECK5: then12:
-// CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK5-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
-// CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
-// CHECK5-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
-// CHECK5-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
-// CHECK5-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
-// CHECK5-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
-// CHECK5-NEXT: br label [[IFCONT14:%.*]]
-// CHECK5: else13:
-// CHECK5-NEXT: br label [[IFCONT14]]
-// CHECK5: ifcont14:
-// CHECK5-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
+// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
+// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
+// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
+// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
+// CHECK2-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
+// CHECK2-NEXT: ret void
//
//
-// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
-// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
-// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
-// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
-// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
-// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
-// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
-// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
-// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
-// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
-// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
-// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
-// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
-// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
-// CHECK5-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK2-NEXT: ret void
//
//
-// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
-// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
-// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
-// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
-// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
-// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
-// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
-// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
-// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
-// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
-// CHECK5-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
+// CHECK2-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
+// CHECK2-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
+// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
+// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
+// CHECK2-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
+// CHECK2-NEXT: ret void
//
//
-// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
-// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK5-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
-// CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
-// CHECK5-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
-// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
-// CHECK5-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
-// CHECK5-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
-// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
-// CHECK5-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
-// CHECK5-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
-// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
-// CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
-// CHECK5-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
-// CHECK5-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
-// CHECK5-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK2-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
+// CHECK2-NEXT: ret void
//
//
-// CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
-// CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK5-NEXT: entry:
-// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK5-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
-// CHECK5-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
-// CHECK5-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
-// CHECK5-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
-// CHECK5-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-// CHECK5-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
-// CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK5-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
-// CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
-// CHECK5-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
-// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
-// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK5-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK5-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
-// CHECK5-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
+// CHECK2-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
+// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK2: .execute:
+// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK2-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
+// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK2: .omp.deinit:
+// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK2: .exit:
+// CHECK2-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
-// CHECK6-SAME: () #[[ATTR0:[0-9]+]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
-// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4
-// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
-// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]]
-// CHECK6: .await.work:
-// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
-// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
-// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
-// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
-// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
-// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
-// CHECK6: .select.workers:
-// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
-// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
-// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
-// CHECK6: .execute.parallel:
-// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
-// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
-// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
-// CHECK6-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
-// CHECK6: .terminate.parallel:
-// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel()
-// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]]
-// CHECK6: .barrier.parallel:
-// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK6-NEXT: br label [[DOTAWAIT_WORK]]
-// CHECK6: .exit:
-// CHECK6-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
+// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK2-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
+// CHECK2-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
+// CHECK2-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK2: .omp.reduction.then:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
+// CHECK2-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
+// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
+// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK2: .omp.reduction.done:
+// CHECK2-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
-// CHECK6-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
-// CHECK6-NEXT: [[E7:%.*]] = alloca double, align 8
-// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
-// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
-// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
-// CHECK6-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
-// CHECK6-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
-// CHECK6: .worker:
-// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
-// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK6: .mastercheck:
-// CHECK6-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK6-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK6-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
-// CHECK6-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
-// CHECK6-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
-// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
-// CHECK6-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
-// CHECK6-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
-// CHECK6: .master:
-// CHECK6-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK6-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
-// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
-// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack()
-// CHECK6-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK6-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
-// CHECK6-NEXT: store double [[TMP7]], double* [[E7]], align 8
-// CHECK6-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
-// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
-// CHECK6: .termination.notifier:
-// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1)
-// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CHECK6-NEXT: br label [[DOTEXIT]]
-// CHECK6: .exit:
-// CHECK6-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK2-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK2-NEXT: store i32 0, i32* [[A1]], align 4
+// CHECK2-NEXT: store i16 -32768, i16* [[B2]], align 2
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
+// CHECK2-NEXT: store i32 [[OR]], i32* [[A1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
+// CHECK2-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
+// CHECK2-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK2-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
+// CHECK2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
+// CHECK2-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK2: .omp.reduction.then:
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK2-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
+// CHECK2-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK2-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
+// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
+// CHECK2-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
+// CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
+// CHECK2: cond.true9:
+// CHECK2-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK2-NEXT: br label [[COND_END11:%.*]]
+// CHECK2: cond.false10:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK2-NEXT: br label [[COND_END11]]
+// CHECK2: cond.end11:
+// CHECK2-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
+// CHECK2-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
+// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
+// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK2: .omp.reduction.done:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK2-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK2-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
+// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
+// CHECK2-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK2-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK2-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
+// CHECK2-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK2-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK2-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+// CHECK2-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK2-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
+// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
+// CHECK2-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
+// CHECK2-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
+// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
+// CHECK2-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
+// CHECK2-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK2: then:
+// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK2-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
+// CHECK2-NEXT: br label [[IFCONT:%.*]]
+// CHECK2: else:
+// CHECK2-NEXT: br label [[IFCONT]]
+// CHECK2: ifcont:
+// CHECK2-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK2-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK2-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
+// CHECK2-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK2: then6:
+// CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
+// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
+// CHECK2-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
+// CHECK2-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
+// CHECK2-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK2-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
+// CHECK2-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
+// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
+// CHECK2-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK2-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK2-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK2-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK2-NEXT: br label [[IFCONT8:%.*]]
+// CHECK2: else7:
+// CHECK2-NEXT: br label [[IFCONT8]]
+// CHECK2: ifcont8:
+// CHECK2-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
-// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK6-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4
-// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
-// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK6-NEXT: store double* [[E]], double** [[E_ADDR]], align 4
-// CHECK6-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
-// CHECK6-NEXT: [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
-// CHECK6-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
-// CHECK6-NEXT: [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
-// CHECK6-NEXT: store double 0.000000e+00, double* [[E1]], align 8
-// CHECK6-NEXT: [[TMP3:%.*]] = load double, double* [[E1]], align 8
-// CHECK6-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00
-// CHECK6-NEXT: store double [[ADD]], double* [[E1]], align 8
-// CHECK6-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
-// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP7:%.*]] = bitcast double* [[E1]] to i8*
-// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
-// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK6-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 2048, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
-// CHECK6-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
-// CHECK6-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
-// CHECK6: .omp.reduction.then:
-// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP0]], align 8
-// CHECK6-NEXT: [[TMP13:%.*]] = load double, double* [[E1]], align 8
-// CHECK6-NEXT: [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]]
-// CHECK6-NEXT: store double [[ADD2]], double* [[TMP0]], align 8
-// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
-// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
-// CHECK6: .omp.reduction.done:
-// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
-// CHECK6-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
+// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK2: then:
+// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK2-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK2-NEXT: br label [[IFCONT:%.*]]
+// CHECK2: else:
+// CHECK2-NEXT: br label [[IFCONT]]
+// CHECK2: ifcont:
+// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK2: then4:
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK2-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK2-NEXT: br label [[IFCONT6:%.*]]
+// CHECK2: else5:
+// CHECK2-NEXT: br label [[IFCONT6]]
+// CHECK2: ifcont6:
+// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK2: then8:
+// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK2-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK2-NEXT: br label [[IFCONT10:%.*]]
+// CHECK2: else9:
+// CHECK2-NEXT: br label [[IFCONT10]]
+// CHECK2: ifcont10:
+// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK2: then12:
+// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
+// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK2-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK2-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK2-NEXT: br label [[IFCONT14:%.*]]
+// CHECK2: else13:
+// CHECK2-NEXT: br label [[IFCONT14]]
+// CHECK2: ifcont14:
+// CHECK2-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
-// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
-// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
-// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
-// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
-// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
-// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
-// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
-// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
-// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
-// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
-// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
-// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
-// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
-// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
-// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
-// CHECK6-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
-// CHECK6-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
-// CHECK6-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
-// CHECK6-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
-// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-// CHECK6-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
-// CHECK6-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
-// CHECK6-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8
-// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
-// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
-// CHECK6-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
-// CHECK6-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4
-// CHECK6-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
-// CHECK6-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK6-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
-// CHECK6-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
-// CHECK6-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
-// CHECK6-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1
-// CHECK6-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
-// CHECK6-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
-// CHECK6-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
-// CHECK6-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
-// CHECK6-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
-// CHECK6-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
-// CHECK6-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK6: then:
-// CHECK6-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
-// CHECK6-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
-// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
-// CHECK6-NEXT: br label [[IFCONT:%.*]]
-// CHECK6: else:
-// CHECK6-NEXT: br label [[IFCONT]]
-// CHECK6: ifcont:
-// CHECK6-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
-// CHECK6-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
-// CHECK6-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
-// CHECK6-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK6: then4:
-// CHECK6-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
-// CHECK6-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
-// CHECK6-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
-// CHECK6-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
-// CHECK6-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
-// CHECK6-NEXT: store double [[TMP46]], double* [[TMP45]], align 8
-// CHECK6-NEXT: br label [[IFCONT6:%.*]]
-// CHECK6: else5:
-// CHECK6-NEXT: br label [[IFCONT6]]
-// CHECK6: ifcont6:
-// CHECK6-NEXT: ret void
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK2-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK2-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK2-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK2-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK2-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK2-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
+// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK2-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
+// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK2-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
+// CHECK2-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK2-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK2-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK2-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK2-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
+// CHECK2-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK2-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK2-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK2-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+// CHECK2-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK2-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1
+// CHECK2-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
+// CHECK2-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
+// CHECK2-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
+// CHECK2-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
+// CHECK2-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
+// CHECK2-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK2: then:
+// CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
+// CHECK2-NEXT: br label [[IFCONT:%.*]]
+// CHECK2: else:
+// CHECK2-NEXT: br label [[IFCONT]]
+// CHECK2: ifcont:
+// CHECK2-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK2-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK2-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
+// CHECK2-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK2: then6:
+// CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
+// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
+// CHECK2-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
+// CHECK2-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
+// CHECK2-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK2-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4
+// CHECK2-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
+// CHECK2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
+// CHECK2-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
+// CHECK2-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
+// CHECK2-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK2-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2
+// CHECK2-NEXT: br label [[IFCONT8:%.*]]
+// CHECK2: else7:
+// CHECK2-NEXT: br label [[IFCONT8]]
+// CHECK2: ifcont8:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
+// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
+// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK2: then:
+// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK2-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
+// CHECK2-NEXT: br label [[IFCONT:%.*]]
+// CHECK2: else:
+// CHECK2-NEXT: br label [[IFCONT]]
+// CHECK2: ifcont:
+// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK2: then4:
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
+// CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
+// CHECK2-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4
+// CHECK2-NEXT: br label [[IFCONT6:%.*]]
+// CHECK2: else5:
+// CHECK2-NEXT: br label [[IFCONT6]]
+// CHECK2: ifcont6:
+// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK2-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK2: then8:
+// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
+// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK2-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
+// CHECK2-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
+// CHECK2-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
+// CHECK2-NEXT: br label [[IFCONT10:%.*]]
+// CHECK2: else9:
+// CHECK2-NEXT: br label [[IFCONT10]]
+// CHECK2: ifcont10:
+// CHECK2-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK2-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
+// CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK2: then12:
+// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK2-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
+// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
+// CHECK2-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
+// CHECK2-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
+// CHECK2-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2
+// CHECK2-NEXT: br label [[IFCONT14:%.*]]
+// CHECK2: else13:
+// CHECK2-NEXT: br label [[IFCONT14]]
+// CHECK2: ifcont14:
+// CHECK2-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
-// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT:
[[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK6-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 -// CHECK6-NEXT: br label [[PRECOND:%.*]] -// CHECK6: precond: -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK6-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] -// CHECK6: body: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* -// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK6-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK6: then4: -// CHECK6-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 -// CHECK6-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* -// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] -// CHECK6-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 -// CHECK6-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 -// CHECK6-NEXT: br label [[IFCONT6:%.*]] -// CHECK6: else5: -// CHECK6-NEXT: br label [[IFCONT6]] -// CHECK6: ifcont6: -// CHECK6-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 -// CHECK6-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 -// CHECK6-NEXT: br label [[PRECOND]] -// CHECK6: exit: -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 
4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 +// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 -// CHECK6-NEXT: store double [[TMP12]], double* [[TMP11]], align 128 -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = 
alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]] +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]] -// CHECK6-NEXT: 
ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128 +// CHECK2-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]] +// CHECK2-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128 +// CHECK2-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2 +// CHECK2-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0* -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* -// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* 
[[E]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128 -// CHECK6-NEXT: store double [[TMP12]], double* [[TMP10]], align 8 -// CHECK6-NEXT: ret void +// CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20 +// CHECK2-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1* +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +// CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]] +// CHECK2-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8* +// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]] +// CHECK2-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_worker +// CHECK3-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 +// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]] +// CHECK3: .await.work: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) +// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 +// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 +// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null +// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] +// CHECK3: .select.workers: +// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 +// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 +// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label 
[[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] +// CHECK3: .execute.parallel: +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* +// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) +// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] +// CHECK3: .terminate.parallel: +// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel() +// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]] +// CHECK3: .barrier.parallel: +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTAWAIT_WORK]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0* -// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]] -// CHECK6-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8* -// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]] -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20 +// CHECK3-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: [[E7:%.*]] = alloca double, align 8 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] +// CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] +// CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] +// 
CHECK3: .worker: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_worker() #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .mastercheck: +// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1 +// CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1 +// CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1 +// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]] +// CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]] +// CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] +// CHECK3: .master: +// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] +// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) +// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP7:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK3-NEXT: store double [[TMP7]], double* [[E7]], align 8 +// CHECK3-NEXT: store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] +// CHECK3: .termination.notifier: +// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) +// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK3-NEXT: br label [[DOTEXIT]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker -// CHECK6-SAME: () #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 -// CHECK6-NEXT: store i8* null, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: br label [[DOTAWAIT_WORK:%.*]] -// CHECK6: .await.work: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]]) -// CHECK6-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8 -// CHECK6-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4 -// CHECK6-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null -// CHECK6-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]] -// CHECK6: .select.workers: -// CHECK6-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1 -// CHECK6-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 -// CHECK6-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] -// CHECK6: .execute.parallel: -// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* -// CHECK6-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) -// CHECK6-NEXT: br label 
[[DOTTERMINATE_PARALLEL:%.*]] -// CHECK6: .terminate.parallel: -// CHECK6-NEXT: call void @__kmpc_kernel_end_parallel() -// CHECK6-NEXT: br label [[DOTBARRIER_PARALLEL]] -// CHECK6: .barrier.parallel: -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTAWAIT_WORK]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK3-NEXT: [[E1:%.*]] = call i8* @__kmpc_alloc_shared(i32 8) +// CHECK3-NEXT: [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double* +// CHECK3-NEXT: store double 0.000000e+00, double* [[E_ON_STACK]], align 8 +// CHECK3-NEXT: [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00 +// CHECK3-NEXT: store double [[ADD]], double* [[E_ON_STACK]], align 8 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8* +// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 2048, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func) +// CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1 +// CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3: .omp.reduction.then: +// CHECK3-NEXT: [[TMP10:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK3-NEXT: [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8 +// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]] +// CHECK3-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) +// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK3: .omp.reduction.done: +// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[E1]]) +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29 -// CHECK6-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4 -// CHECK6-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 -// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* -// CHECK6-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]] -// CHECK6-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] -// CHECK6-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] -// CHECK6: .worker: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]] -// CHECK6-NEXT: br label [[DOTEXIT:%.*]] -// CHECK6: .mastercheck: -// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1 -// CHECK6-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1 -// CHECK6-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1 -// CHECK6-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]] -// CHECK6-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]] -// CHECK6-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]] -// CHECK6: .master: -// CHECK6-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK6-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]] -// CHECK6-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1) -// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK6-NEXT: [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1* -// CHECK6-NEXT: [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4 -// CHECK6-NEXT: [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1 -// CHECK6-NEXT: store i8 [[TMP7]], i8* [[C8]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = load float, float* [[CONV1]], align 4 -// CHECK6-NEXT: [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: store float [[TMP8]], float* [[D9]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) -// CHECK6-NEXT: store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]] -// 
CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]]) -// CHECK6-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] -// CHECK6: .termination.notifier: -// CHECK6-NEXT: call void @__kmpc_kernel_deinit(i16 1) -// CHECK6-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK6-NEXT: br label [[DOTEXIT]] -// CHECK6: .exit: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK3-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK3-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK3-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK3-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK3-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK3-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK3-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK3-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK3-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK3-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK3-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK3-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK3-NEXT: 
[[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK3-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK3-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK3-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK3-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3: then4: +// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK3-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK3-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK3-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK3-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3: else5: +// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3: ifcont6: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1 -// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 -// CHECK6-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 -// CHECK6-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 -// CHECK6-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1) -// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2* -// CHECK6-NEXT: [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1 -// CHECK6-NEXT: [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0 -// CHECK6-NEXT: store i8 0, i8* [[C1]], align 4 -// CHECK6-NEXT: store float 1.000000e+00, float* [[D2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK6-NEXT: [[CONV:%.*]] = sext i8 [[TMP4]] to i32 -// CHECK6-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 -// CHECK6-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 -// CHECK6-NEXT: store i8 [[CONV3]], i8* [[C1]], 
align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = load float, float* [[D2]], align 4 -// CHECK6-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01 -// CHECK6-NEXT: store float [[MUL]], float* [[D2]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 -// CHECK6-NEXT: store i8* [[C1]], i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[D2]] to i8* -// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* -// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 -// CHECK6-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8) -// CHECK6-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1 -// CHECK6-NEXT: br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] -// CHECK6: .omp.reduction.then: -// CHECK6-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1 -// CHECK6-NEXT: [[CONV4:%.*]] = sext i8 [[TMP15]] to i32 -// CHECK6-NEXT: [[TMP16:%.*]] = load i8, i8* [[C1]], align 4 -// CHECK6-NEXT: [[CONV5:%.*]] = sext i8 [[TMP16]] to i32 -// CHECK6-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] -// CHECK6-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 -// CHECK6-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 -// CHECK6-NEXT: [[TMP17:%.*]] = load float, float* [[TMP1]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = load float, float* [[D2]], align 4 -// CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]] -// CHECK6-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 -// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) -// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]] -// CHECK6: .omp.reduction.done: -// CHECK6-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]]) -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = 
ashr i32 [[NVPTX_TID3]], 5 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK3-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK3-NEXT: br label [[PRECOND:%.*]] +// CHECK3: precond: +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK3-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK3: body: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK3-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3: then4: +// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK3-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK3-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3: else5: +// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3: ifcont6: +// CHECK3-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK3-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK3-NEXT: br label [[PRECOND]] +// CHECK3: exit: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], 
align 4 -// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK6-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 -// CHECK6-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK6-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) -// CHECK6-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 -// CHECK6-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 -// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK6-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 -// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* -// CHECK6-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 -// CHECK6-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* -// CHECK6-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* -// CHECK6-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* -// CHECK6-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK6-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 -// CHECK6-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 -// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 -// CHECK6-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK6-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 -// CHECK6-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] -// CHECK6-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK6-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 -// CHECK6-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 -// CHECK6-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] -// CHECK6-NEXT: [[TMP42:%.*]] = icmp sgt i16 
[[TMP7]], 0 -// CHECK6-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] -// CHECK6-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] -// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] -// CHECK6-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] -// CHECK6-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK6: then6: -// CHECK6-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 -// CHECK6-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 -// CHECK6-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 -// CHECK6-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 -// CHECK6-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 -// CHECK6-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 -// CHECK6-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* -// CHECK6-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* -// CHECK6-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 -// CHECK6-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 -// CHECK6-NEXT: br label [[IFCONT8:%.*]] -// CHECK6: else7: -// CHECK6-NEXT: br label [[IFCONT8]] -// CHECK6: ifcont8: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty* +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double* +// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds 
+// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
+// CHECK3-NEXT: store double [[TMP12]], double* [[TMP11]], align 128
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
+// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
+// CHECK3-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
-// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
-// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
-// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
-// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
-// CHECK6: then:
-// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
-// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
-// CHECK6-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
-// CHECK6-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
-// CHECK6-NEXT: br label [[IFCONT:%.*]]
-// CHECK6: else:
-// CHECK6-NEXT: br label [[IFCONT]]
-// CHECK6: ifcont:
-// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
-// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
-// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK6: then4:
-// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
-// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
-// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
-// CHECK6-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1
-// CHECK6-NEXT: br label [[IFCONT6:%.*]]
-// CHECK6: else5:
-// CHECK6-NEXT: br label [[IFCONT6]]
-// CHECK6: ifcont6:
-// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
-// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
-// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
-// CHECK6: then8:
-// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
-// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
-// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
-// CHECK6-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
-// CHECK6-NEXT: br label [[IFCONT10:%.*]]
-// CHECK6: else9:
-// CHECK6-NEXT: br label [[IFCONT10]]
-// CHECK6: ifcont10:
-// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
-// CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
-// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
-// CHECK6: then12:
-// CHECK6-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
-// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4
-// CHECK6-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
-// CHECK6-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
-// CHECK6-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4
-// CHECK6-NEXT: br label [[IFCONT14:%.*]]
-// CHECK6: else13:
-// CHECK6-NEXT: br label [[IFCONT14]]
-// CHECK6: ifcont14:
-// CHECK6-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
+// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
+// CHECK3-NEXT: store double [[TMP12]], double* [[TMP10]], align 8
+// CHECK3-NEXT: ret void
//
//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
+// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
+// CHECK3-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
-// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
-// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
-// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
-// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
-// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
-// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
-// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
-// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
-// CHECK6-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_worker
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
+// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4
+// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
+// CHECK3: .await.work:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
+// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
+// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
+// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
+// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
+// CHECK3: .select.workers:
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
+// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
+// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
+// CHECK3: .execute.parallel:
+// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
+// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
+// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
+// CHECK3: .terminate.parallel:
+// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
+// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
+// CHECK3: .barrier.parallel:
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
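The regenerated worker checks above keep the generic-mode state machine intact: workers spin at a barrier, fetch a work function through __kmpc_kernel_parallel, and exit once it is null. For orientation, a minimal host-side C++ model of that loop (an illustrative sketch only; the real primitives are device-side and warp-synchronous, and WorkQueue/KernelParallel here are invented stand-ins, not runtime APIs):

#include <cstdio>

using WorkFnTy = void (*)(short IsFinal, int ThreadId);

static void ParallelRegion(short, int Tid) { std::printf("work on %d\n", Tid); }

// One queued region, then a null terminator. This models the shared
// work-function slot the runtime publishes to its workers.
static WorkFnTy WorkQueue[] = {ParallelRegion, nullptr};

// Models __kmpc_kernel_parallel: publish the next work function and
// report whether this thread should execute it (exec_status in the IR).
static bool KernelParallel(WorkFnTy &WorkFn, int Index) {
  WorkFn = WorkQueue[Index];
  return WorkFn != nullptr;
}

int main() {
  WorkFnTy WorkFn = nullptr;
  for (int Index = 0;; ++Index) { // .await.work (barrier elided)
    bool IsActive = KernelParallel(WorkFn, Index);
    if (!WorkFn)                  // null work_fn -> .exit
      break;
    if (IsActive)                 // .select.workers / .execute.parallel
      WorkFn(/*IsFinal=*/0, /*ThreadId=*/0);
    // .terminate.parallel and .barrier.parallel round out the loop
  }
  return 0;
}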
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
+// CHECK3-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[C]], i32* [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
+// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
+// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK3: .worker:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_worker() #[[ATTR3]]
+// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK3: .mastercheck:
+// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
+// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
+// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
+// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
+// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
+// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK3: .master:
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
+// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i8, i8* [[CONV]], align 4
+// CHECK3-NEXT: [[C8:%.*]] = call i8* @__kmpc_alloc_shared(i32 1)
+// CHECK3-NEXT: store i8 [[TMP5]], i8* [[C8]], align 1
+// CHECK3-NEXT: [[TMP6:%.*]] = load float, float* [[CONV1]], align 4
+// CHECK3-NEXT: [[D9:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
+// CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D9]] to float*
+// CHECK3-NEXT: store float [[TMP6]], float* [[D_ON_STACK]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: store i32 [[TMP7]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D_ON_STACK]]) #[[ATTR3]]
+// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[D9]])
+// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[C8]])
+// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK3: .termination.notifier:
+// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK3-NEXT: br label [[DOTEXIT]]
+// CHECK3: .exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4
+// CHECK3-NEXT: store float* [[D]], float** [[D_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
+// CHECK3-NEXT: [[C1:%.*]] = call i8* @__kmpc_alloc_shared(i32 1)
+// CHECK3-NEXT: [[D2:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
+// CHECK3-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
+// CHECK3-NEXT: store i8 0, i8* [[C1]], align 1
+// CHECK3-NEXT: store float 1.000000e+00, float* [[D_ON_STACK]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
+// CHECK3-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32
+// CHECK3-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
+// CHECK3-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
+// CHECK3-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1
+// CHECK3-NEXT: [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
+// CHECK3-NEXT: store float [[MUL]], float* [[D_ON_STACK]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8*
+// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 2048, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
+// CHECK3-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1
+// CHECK3-NEXT: br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK3: .omp.reduction.then:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1
+// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK3-NEXT: [[TMP14:%.*]] = load i8, i8* [[C1]], align 1
+// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP14]] to i32
+// CHECK3-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
+// CHECK3-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
+// CHECK3-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1
+// CHECK3-NEXT: [[TMP15:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4
+// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]]
+// CHECK3-NEXT: store float [[MUL8]], float* [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK3: .omp.reduction.done:
+// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[D2]])
+// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[C1]])
+// CHECK3-NEXT: ret void
//
//
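The interesting delta in __omp_outlined__1 above is the allocation discipline: each escaped local now comes from its own __kmpc_alloc_shared(size) call and is released with __kmpc_free_shared in reverse allocation order (D2 before C1), replacing the old globalized-record push/pop. A toy LIFO allocator capturing that contract, as a sketch under the assumption that the runtime behaves stack-like (AllocShared/FreeShared are illustrative stand-ins, not the real entry points):

#include <cassert>
#include <cstddef>

static char Stack[1024];
static size_t Top = 0;

void *AllocShared(size_t Size) { // stand-in for __kmpc_alloc_shared
  void *P = &Stack[Top];
  Top += (Size + 7) & ~size_t(7); // keep 8-byte alignment
  return P;
}

void FreeShared(void *Ptr) { // stand-in for __kmpc_free_shared
  assert(Ptr >= Stack && Ptr < Stack + Top && "frees must be LIFO");
  Top = static_cast<char *>(Ptr) - Stack;
}

int main() {
  void *C = AllocShared(1); // mirrors: [[C1]] = call i8* @__kmpc_alloc_shared(i32 1)
  void *D = AllocShared(4); // mirrors: [[D2]] = call i8* @__kmpc_alloc_shared(i32 4)
  FreeShared(D);            // frees are emitted in reverse order
  FreeShared(C);
  return 0;
}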
-// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
-// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
-// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
-// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
-// CHECK6-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
-// CHECK6-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
-// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
-// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
-// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
-// CHECK6-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
-// CHECK6-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
-// CHECK6-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK3-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
+// CHECK3-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
+// CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
+// CHECK3-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
+// CHECK3-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK3-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
+// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
+// CHECK3-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
+// CHECK3-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
+// CHECK3-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
+// CHECK3-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
+// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
+// CHECK3-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1
+// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
+// CHECK3-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK3-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4
+// CHECK3-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK3-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
+// CHECK3-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK3-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1
+// CHECK3-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
+// CHECK3-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
+// CHECK3-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK3-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
+// CHECK3-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
+// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
+// CHECK3-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK3: then:
+// CHECK3-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
+// CHECK3-NEXT: br label [[IFCONT:%.*]]
+// CHECK3: else:
+// CHECK3-NEXT: br label [[IFCONT]]
+// CHECK3: ifcont:
+// CHECK3-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK3-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
+// CHECK3-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK3: then6:
+// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4
+// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
+// CHECK3-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
+// CHECK3-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1
+// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
+// CHECK3-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
+// CHECK3-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
+// CHECK3-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
+// CHECK3-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
+// CHECK3-NEXT: store float [[TMP62]], float* [[TMP61]], align 4
+// CHECK3-NEXT: br label [[IFCONT8:%.*]]
+// CHECK3: else7:
+// CHECK3-NEXT: br label [[IFCONT8]]
+// CHECK3: ifcont8:
+// CHECK3-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
-// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
-// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
-// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK6-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
-// CHECK6-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
-// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK6-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
-// CHECK6-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
-// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
-// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK6-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
-// CHECK6-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
+// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK3: then:
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
+// CHECK3-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
+// CHECK3-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
+// CHECK3-NEXT: br label [[IFCONT:%.*]]
+// CHECK3: else:
+// CHECK3-NEXT: br label [[IFCONT]]
+// CHECK3: ifcont:
+// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
+// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK3: then4:
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
+// CHECK3-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1
+// CHECK3-NEXT: br label [[IFCONT6:%.*]]
+// CHECK3: else5:
+// CHECK3-NEXT: br label [[IFCONT6]]
+// CHECK3: ifcont6:
+// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
+// CHECK3: then8:
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
+// CHECK3-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
+// CHECK3-NEXT: br label [[IFCONT10:%.*]]
+// CHECK3: else9:
+// CHECK3-NEXT: br label [[IFCONT10]]
+// CHECK3: ifcont10:
+// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
+// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
+// CHECK3: then12:
+// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
+// CHECK3-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
+// CHECK3-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4
+// CHECK3-NEXT: br label [[IFCONT14:%.*]]
+// CHECK3: else13:
+// CHECK3-NEXT: br label [[IFCONT14]]
+// CHECK3: ifcont14:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
+// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 128
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
+// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
+// CHECK3-NEXT: store float [[TMP16]], float* [[TMP15]], align 128
+// CHECK3-NEXT: ret void
//
//
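The list_to_global/global_to_list helpers above marshal between a thread's reduce list (an array of i8* element pointers) and one slot per team in the teams-reduction scratchpad. Rendered as C++, the layout implied by %struct._globalized_locals_ty.0 looks roughly like this (illustrative only; field names follow the IR, the 2048 bound matches the i32 2048 argument to __kmpc_nvptx_teams_reduce_nowait_v2, and the IR additionally aligns each slot row to 128 bytes):

struct GlobalizedLocals {
  char c[2048];  // one slot per team for the char reduction
  float d[2048]; // one slot per team for the float reduction
};

// Mirrors _omp_reduction_list_to_global_copy_func5: copy this thread's
// partial values into the slot owned by team `Idx`.
inline void ListToGlobalCopy(GlobalizedLocals *G, int Idx, void **ReduceList) {
  G->c[Idx] = *static_cast<char *>(ReduceList[0]);
  G->d[Idx] = *static_cast<float *>(ReduceList[1]);
}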
-// CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
-// CHECK6-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-// CHECK6-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
-// CHECK6-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
-// CHECK6-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
-// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
-// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
-// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
-// CHECK6: .execute:
-// CHECK6-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
-// CHECK6-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
-// CHECK6-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
-// CHECK6-NEXT: br label [[DOTOMP_DEINIT:%.*]]
-// CHECK6: .omp.deinit:
-// CHECK6-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
-// CHECK6-NEXT: br label [[DOTEXIT:%.*]]
-// CHECK6: .exit:
-// CHECK6-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK3-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9
-// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
-// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
-// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2
-// CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
-// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
-// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
-// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
-// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
-// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
-// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4
-// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2
-// CHECK6-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
-// CHECK6-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
-// CHECK6-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
-// CHECK6-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
-// CHECK6-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
-// CHECK6-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
-// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
-// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
-// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
-// CHECK6-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4
-// CHECK6-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
-// CHECK6-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
-// CHECK6-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
-// CHECK6-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
-// CHECK6: .omp.reduction.then:
-// CHECK6-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK6-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
-// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
-// CHECK6-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4
-// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32
-// CHECK6-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
-// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
-// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK6: cond.true:
-// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK6-NEXT: br label [[COND_END:%.*]]
-// CHECK6: cond.false:
-// CHECK6-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK6-NEXT: br label [[COND_END]]
-// CHECK6: cond.end:
-// CHECK6-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
-// CHECK6-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2
-// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
-// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
-// CHECK6: .omp.reduction.done:
-// CHECK6-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
+// CHECK3-NEXT: store i8 [[TMP11]], i8* [[TMP9]], align 1
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
+// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
+// CHECK3-NEXT: store float [[TMP16]], float* [[TMP14]], align 4
+// CHECK3-NEXT: ret void
//
//
-// CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10
-// CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
-// CHECK6-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4
-// CHECK6-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4
-// CHECK6-NEXT: [[A1:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[B2:%.*]] = alloca i16, align 2
-// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
-// CHECK6-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK6-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
-// CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
-// CHECK6-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4
-// CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
-// CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
-// CHECK6-NEXT: store i32 0, i32* [[A1]], align 4
-// CHECK6-NEXT: store i16 -32768, i16* [[B2]], align 2
-// CHECK6-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
-// CHECK6-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
-// CHECK6-NEXT: store i32 [[OR]], i32* [[A1]], align 4
-// CHECK6-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK6-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
-// CHECK6-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
-// CHECK6-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK6: cond.true:
-// CHECK6-NEXT: br label [[COND_END:%.*]]
-// CHECK6: cond.false:
-// CHECK6-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK6-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
-// CHECK6-NEXT: br label [[COND_END]]
-// CHECK6: cond.end:
-// CHECK6-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
-// CHECK6-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
-// CHECK6-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2
-// CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
-// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4
-// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
-// CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4
-// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK6-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
-// CHECK6-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
-// CHECK6-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
-// CHECK6: .omp.reduction.then:
-// CHECK6-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
-// CHECK6-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
-// CHECK6-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4
-// CHECK6-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK6-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
-// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK6-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
-// CHECK6-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
-// CHECK6-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
-// CHECK6: cond.true9:
-// CHECK6-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK6-NEXT: br label [[COND_END11:%.*]]
-// CHECK6: cond.false10:
-// CHECK6-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
-// CHECK6-NEXT: br label [[COND_END11]]
-// CHECK6: cond.end11:
-// CHECK6-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
-// CHECK6-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2
-// CHECK6-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
-// CHECK6-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
-// CHECK6: .omp.reduction.done:
-// CHECK6-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
+// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
+// CHECK3-NEXT: ret void
+//
+//
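For context on the _l33 entry point that follows: it is emitted on the SPMD path (__kmpc_spmd_kernel_init straight into the outlined call, with the old __kmpc_data_sharing_init_stack_spmd call gone after this patch), and the or/max combines in __omp_outlined__9/10 suggest source along these lines (a hypothetical reconstruction; the actual test source sits outside this hunk):

int reduce_like(int n) {
  int a = 0;
  short b = 0;
#pragma omp target teams reduction(| : a) reduction(max : b)
#pragma omp parallel reduction(| : a) reduction(max : b)
  {
    a |= 1;                // lowers to the `or i32 ..., 1` seen in __omp_outlined__10
    b = 99 > b ? 99 : b;   // lowers to the icmp sgt 99 / phi max pattern
  }
  return a + b;
}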
@__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) +// CHECK3-NEXT: store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4 +// CHECK3-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] +// CHECK3: .omp.deinit: +// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) +// CHECK3-NEXT: br label [[DOTEXIT:%.*]] +// CHECK3: .exit: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK6-NEXT: [[TMP27:%.*]] = load i16, 
i16* [[TMP24]], align 2 -// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK6: then6: -// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 -// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK6-NEXT: br label [[IFCONT8:%.*]] 
-// CHECK6: else7: -// CHECK6-NEXT: br label [[IFCONT8]] -// CHECK6: ifcont8: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__9 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK3-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* 
@_omp_reduction_global_to_list_reduce_func20) +// CHECK3-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1 +// CHECK3-NEXT: br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3: .omp.reduction.then: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]] +// CHECK3-NEXT: store i32 [[OR]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP19]] to i32 +// CHECK3-NEXT: [[TMP20:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP20]] to i32 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]] +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP22:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i16 [[COND]], i16* [[TMP1]], align 2 +// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]]) +// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK3: .omp.reduction.done: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: 
[[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK6: then4: -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK6-NEXT: br label [[IFCONT6:%.*]] -// CHECK6: else5: -// CHECK6-NEXT: br label [[IFCONT6]] -// CHECK6: ifcont6: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK6: then8: -// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK6-NEXT: br label [[IFCONT10:%.*]] -// CHECK6: else9: -// CHECK6-NEXT: br label [[IFCONT10]] -// CHECK6: ifcont10: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK6: then12: -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK6-NEXT: br label [[IFCONT14:%.*]] -// CHECK6: else13: -// CHECK6-NEXT: br label [[IFCONT14]] -// CHECK6: ifcont14: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__10 +// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// 
CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK3-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK3-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK3-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK3-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK3-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13) +// CHECK3-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK3-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK3: .omp.reduction.then: +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK3-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK3-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK3-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK3-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK3: cond.true9: +// 
CHECK3-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK3-NEXT: br label [[COND_END11:%.*]] +// CHECK3: cond.false10: +// CHECK3-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK3-NEXT: br label [[COND_END11]] +// CHECK3: cond.end11: +// CHECK3-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK3-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK3: .omp.reduction.done: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK6-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 -// CHECK6-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 -// CHECK6-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 -// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* -// CHECK6-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 -// CHECK6-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) -// CHECK6-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 -// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* -// CHECK6-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 -// CHECK6-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], 
i32 1 -// CHECK6-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* -// CHECK6-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 -// CHECK6-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 -// CHECK6-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK6-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 -// CHECK6-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) -// CHECK6-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 -// CHECK6-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 -// CHECK6-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 -// CHECK6-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 -// CHECK6-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* -// CHECK6-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 -// CHECK6-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 -// CHECK6-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] -// CHECK6-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 -// CHECK6-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 -// CHECK6-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 -// CHECK6-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] -// CHECK6-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 -// CHECK6-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] -// CHECK6-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] -// CHECK6-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] -// CHECK6-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* -// CHECK6-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 -// CHECK6-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] -// CHECK6-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] -// CHECK6-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] -// CHECK6: then6: -// CHECK6-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 -// CHECK6-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 -// CHECK6-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* -// CHECK6-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* -// CHECK6-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 -// CHECK6-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 -// CHECK6-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 -// CHECK6-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 -// CHECK6-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* -// CHECK6-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* -// CHECK6-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], 
align 2 -// CHECK6-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 -// CHECK6-NEXT: br label [[IFCONT8:%.*]] -// CHECK6: else7: -// CHECK6-NEXT: br label [[IFCONT8]] -// CHECK6: ifcont8: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK3-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK3-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK3-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK3-NEXT: [[TMP30:%.*]] = call i32 
@__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK3-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK3-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK3-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK3-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK3-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK3-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK3-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK3: then6: +// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK3-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK3-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK3-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK3-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK3-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK3-NEXT: br label [[IFCONT8:%.*]] +// CHECK3: else7: +// CHECK3-NEXT: br label [[IFCONT8]] +// CHECK3: ifcont8: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// 
CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3: then4: +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK3-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3: else5: +// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3: ifcont6: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK3: then8: +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK3-NEXT: 
[[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK3-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK3-NEXT: br label [[IFCONT10:%.*]] +// CHECK3: else9: +// CHECK3-NEXT: br label [[IFCONT10]] +// CHECK3: ifcont10: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK3: then12: +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK3-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK3-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK3-NEXT: br label [[IFCONT14:%.*]] +// CHECK3: else13: +// CHECK3-NEXT: br label [[IFCONT14]] +// CHECK3: ifcont14: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 -// CHECK6-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK6-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] -// CHECK6: then: -// CHECK6-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 -// CHECK6-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 -// CHECK6-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 -// CHECK6-NEXT: br label [[IFCONT:%.*]] -// CHECK6: else: -// CHECK6-NEXT: br label [[IFCONT]] -// CHECK6: ifcont: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], 
[[TMP10]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] -// CHECK6: then4: -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 -// CHECK6-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* -// CHECK6-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 -// CHECK6-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 -// CHECK6-NEXT: br label [[IFCONT6:%.*]] -// CHECK6: else5: -// CHECK6-NEXT: br label [[IFCONT6]] -// CHECK6: ifcont6: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 -// CHECK6-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] -// CHECK6: then8: -// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 -// CHECK6-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* -// CHECK6-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] -// CHECK6-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* -// CHECK6-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 -// CHECK6-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 -// CHECK6-NEXT: br label [[IFCONT10:%.*]] -// CHECK6: else9: -// CHECK6-NEXT: br label [[IFCONT10]] -// CHECK6: ifcont10: -// CHECK6-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) -// CHECK6-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] -// CHECK6-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] -// CHECK6: then12: -// CHECK6-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] -// CHECK6-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* -// CHECK6-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 -// CHECK6-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* -// CHECK6-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 -// CHECK6-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 -// CHECK6-NEXT: br label [[IFCONT14:%.*]] -// CHECK6: else13: -// CHECK6-NEXT: br label [[IFCONT14]] -// CHECK6: ifcont14: -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, 
align 4 +// CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK3-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK3-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK3-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK3-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK3-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK3-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK3-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK3-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK3-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK3-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK3-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// 
CHECK3-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK3-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK3-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK3-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK3-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK3-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK3-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]] +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK3-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK3-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK3: then6: +// CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK3-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK3-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK3-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK3-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK3-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK3-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK3-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK3-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK3-NEXT: br label [[IFCONT8:%.*]] +// CHECK3: else7: +// CHECK3-NEXT: br label [[IFCONT8]] +// CHECK3: ifcont8: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]]) +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] 
to [2 x i8*]* +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK3: then: +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK3-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK3-NEXT: br label [[IFCONT:%.*]] +// CHECK3: else: +// CHECK3-NEXT: br label [[IFCONT]] +// CHECK3: ifcont: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK3: then4: +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK3-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK3-NEXT: br label [[IFCONT6:%.*]] +// CHECK3: else5: +// CHECK3-NEXT: br label [[IFCONT6]] +// CHECK3: ifcont6: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK3-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK3: then8: +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK3-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK3-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK3-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK3-NEXT: br label [[IFCONT10:%.*]] +// CHECK3: else9: +// CHECK3-NEXT: br label [[IFCONT10]] +// CHECK3: ifcont10: +// CHECK3-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]]) +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK3: then12: +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK3-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to 
i16 addrspace(3)* +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK3-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK3-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK3-NEXT: br label [[IFCONT14:%.*]] +// CHECK3: else13: +// CHECK3-NEXT: br label [[IFCONT14]] +// CHECK3: ifcont14: +// CHECK3-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 -// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 -// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* -// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 -// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4* -// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4 -// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4 -// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32* -// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0 -// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 -// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128 -// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 -// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16* -// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1 -// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]] -// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2 -// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128 -// CHECK6-NEXT: ret void +// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17 +// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4 +// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to 
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP11]], align 128
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
+// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP16]], align 128
+// CHECK3-NEXT: ret void
 //
 //
-// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
-// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
-// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
-// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
-// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
-// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
-// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
-// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
-// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
-// CHECK6-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
+// CHECK3-NEXT: ret void
 //
 //
-// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
-// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
-// CHECK6-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
-// CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
-// CHECK6-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
-// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
-// CHECK6-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
-// CHECK6-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
-// CHECK6-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
-// CHECK6-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
-// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
-// CHECK6-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
-// CHECK6-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
-// CHECK6-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
+// CHECK3-NEXT: store i32 [[TMP12]], i32* [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
+// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
+// CHECK3-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
+// CHECK3-NEXT: store i16 [[TMP17]], i16* [[TMP15]], align 2
+// CHECK3-NEXT: ret void
 //
 //
-// CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
-// CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
-// CHECK6-NEXT: entry:
-// CHECK6-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK6-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
-// CHECK6-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
-// CHECK6-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
-// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
-// CHECK6-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
-// CHECK6-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
-// CHECK6-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
-// CHECK6-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
-// CHECK6-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
-// CHECK6-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
-// CHECK6-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
-// CHECK6-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
-// CHECK6-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
-// CHECK6-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
-// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
-// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
-// CHECK6-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
-// CHECK6-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
-// CHECK6-NEXT: ret void
+// CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
+// CHECK3-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i8*, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+// CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
+// CHECK3-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
+// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
+// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
+// CHECK3-NEXT: ret void
 //
diff --git a/clang/test/OpenMP/target_parallel_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
--- a/clang/test/OpenMP/target_parallel_debug_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
@@ -97,7 +97,6 @@
 // CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG45]]
 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG45]]
 // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG45]]
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG45]]
 // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG45]]
 // CHECK1: .execute:
 // CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -321,7 +320,6 @@
 // CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG135]]
 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG135]]
 // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG135]]
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG135]]
 // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG135]]
 // CHECK1: .execute:
 // CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
@@ -539,7 +537,6 @@
 // CHECK1-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg [[DBG210]]
 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG210]]
 // CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG210]]
-// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG210]]
 // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG210]]
 // CHECK1: .execute:
 // CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB5:[0-9]+]])
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -429,20 +429,14 @@
           GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr)
 __OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
 
-__OMP_RTL(__kmpc_data_sharing_init_stack, false, Void, )
-__OMP_RTL(__kmpc_data_sharing_init_stack_spmd, false, Void, )
-__OMP_RTL(__kmpc_data_sharing_coalesced_push_stack, false, VoidPtr, SizeTy, Int16)
-__OMP_RTL(__kmpc_data_sharing_push_stack, false, VoidPtr, SizeTy, Int16)
-__OMP_RTL(__kmpc_data_sharing_pop_stack, false, Void, VoidPtr)
+__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
+__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
 __OMP_RTL(__kmpc_end_sharing_variables, false, Void, )
 __OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr)
 __OMP_RTL(__kmpc_parallel_level, false, Int16, IdentPtr, Int32)
 __OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, )
-__OMP_RTL(__kmpc_get_team_static_memory, false, Void, Int16, VoidPtr, SizeTy,
-          Int16, VoidPtrPtr)
-__OMP_RTL(__kmpc_restore_team_static_memory, false, Void, Int16, Int16)
 
 __OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_warp_active_thread_mask, false, LanemaskTy,)
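Note on the OMPKinds.def hunk above: the five stack-based entry points collapse into one malloc/free-style pair, `void *__kmpc_alloc_shared(size_t)` and `void __kmpc_free_shared(void *)`. A minimal C++ sketch of what a conforming device runtime could provide follows; the signatures come from the __OMP_RTL declarations above, but the malloc-backed bodies are an assumption for illustration, not the actual deviceRTL definition:

  #include <cstddef>
  #include <cstdlib>

  // Sketch only: signatures fixed by OMPKinds.def above, bodies assumed.
  extern "C" void *__kmpc_alloc_shared(std::size_t Bytes) {
    // Backing storage for a globalized local; a real device runtime would
    // use a device-side allocator rather than plain malloc.
    return std::malloc(Bytes);
  }

  extern "C" void __kmpc_free_shared(void *Ptr) {
    // Must be handed the exact pointer returned by __kmpc_alloc_shared.
    std::free(Ptr);
  }

Because allocation and release now form an explicit pair, passes such as OpenMPOpt can match both calls directly instead of reasoning about push/pop ordering on a shared stack, which is what the next hunk does.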
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1122,9 +1122,8 @@
   }
 
   void analysisGlobalization() {
-    RuntimeFunction GlobalizationRuntimeIDs[] = {
-        OMPRTL___kmpc_data_sharing_coalesced_push_stack,
-        OMPRTL___kmpc_data_sharing_push_stack};
+    RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared,
+                                                 OMPRTL___kmpc_free_shared};
 
     for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) {
       auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID];
diff --git a/llvm/test/Transforms/OpenMP/globalization_remarks.ll b/llvm/test/Transforms/OpenMP/globalization_remarks.ll
--- a/llvm/test/Transforms/OpenMP/globalization_remarks.ll
+++ b/llvm/test/Transforms/OpenMP/globalization_remarks.ll
@@ -2,144 +2,41 @@ ; ModuleID = 'declare_target_codegen_globalization.cpp'
 source_filename = "declare_target_codegen_globalization.cpp"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
+target triple = "nvptx64"
 
-%struct.ident_t = type { i32, i32, i32, i32, i8* }
-%struct._globalized_locals_ty = type { [32 x i32] }
+; CHECK: remark: globalization_remarks.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
 
-@0 = private unnamed_addr constant [56 x i8] c";declare_target_codegen_globalization.cpp;maini1;17;1;;\00", align 1
-@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([56 x i8], [56 x i8]* @0, i32 0, i32 0) }, align 8
-@__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode = weak constant i8 0
-@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode], section "llvm.metadata"
+@S = external local_unnamed_addr global i8*
 
-; CHECK: remark: declare_target_codegen_globalization.cpp:17:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
-; CHECK: remark: declare_target_codegen_globalization.cpp:10:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
-
-; Function Attrs: norecurse nounwind
-define weak void @__omp_offloading_801_3022563__Z6maini1v_l17(i32* nonnull align 4 dereferenceable(4) %a) local_unnamed_addr #0 !dbg !10 {
+define void @foo() {
 entry:
-  %nvptx_num_threads = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg !12, !range !13
-  tail call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 0) #4, !dbg !12
-  tail call void @__kmpc_data_sharing_init_stack_spmd() #4, !dbg !12
-  %0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  %1 = tail call i8 @__kmpc_is_spmd_exec_mode() #4
-  %.not.i.i = icmp eq i8 %1, 0
-  br i1 %.not.i.i, label %.non-spmd2.i.i, label %__omp_outlined__.exit
-
-.non-spmd2.i.i: ; preds = %entry
-  %2 = tail call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0) #4, !dbg !12
-  tail call void @__kmpc_data_sharing_pop_stack(i8* %2) #4, !dbg !14
-  br label %__omp_outlined__.exit, !dbg !14
-
-__omp_outlined__.exit: ; preds = %entry, %.non-spmd2.i.i
-  tail call void @__kmpc_spmd_kernel_deinit_v2(i16 1) #4, !dbg !19
-  ret void, !dbg !20
+  %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !8
+  %x_on_stack = bitcast i8* %0 to i32*
+  %1 = bitcast i32* %x_on_stack to i8*
+  call void @share(i8* %1)
+  call void @__kmpc_free_shared(i8* %0)
+  ret void
 }
 
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
-
-declare void @__kmpc_spmd_kernel_init(i32, i16, i16) local_unnamed_addr
-
-declare void @__kmpc_data_sharing_init_stack_spmd() local_unnamed_addr
-
-; Function Attrs: norecurse nounwind readonly
-define hidden i32 @_Z3fooRi(i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) local_unnamed_addr #2 !dbg !21 {
+define void @share(i8* %x) {
 entry:
-  %0 = load i32, i32* %a, align 4, !dbg !22, !tbaa !23
-  ret i32 %0, !dbg !27
+  store i8* %x, i8** @S
+  ret void
 }
 
-; Function Attrs: nounwind
-define hidden i32 @_Z3barv() local_unnamed_addr #3 !dbg !15 {
-entry:
-  %a1 = alloca i32, align 4
-  %0 = tail call i8 @__kmpc_is_spmd_exec_mode() #4
-  %.not = icmp eq i8 %0, 0
-  br i1 %.not, label %.non-spmd, label %.exit
-
-.non-spmd: ; preds = %entry
-  %1 = tail call i8* @__kmpc_data_sharing_push_stack(i64 128, i16 0) #4, !dbg !31
-  %2 = bitcast i8* %1 to %struct._globalized_locals_ty*
-  br label %.exit
+declare i8* @__kmpc_alloc_shared(i64)
 
-.exit: ; preds = %entry, %.non-spmd
-  %_select_stack = phi %struct._globalized_locals_ty* [ %2, %.non-spmd ], [ null, %entry ]
-  %nvptx_tid = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !28
-  %nvptx_lane_id = and i32 %nvptx_tid, 31
-  %3 = zext i32 %nvptx_lane_id to i64
-  %4 = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %_select_stack, i64 0, i32 0, i64 %3
-  %5 = select i1 %.not, i32* %4, i32* %a1
-  %6 = load i32, i32* %5, align 4, !dbg !29, !tbaa !23
-  br i1 %.not, label %.non-spmd2, label %.exit3, !dbg !31
+declare void @__kmpc_free_shared(i8*)
 
-.non-spmd2: ; preds = %.exit
-  %7 = bitcast %struct._globalized_locals_ty* %_select_stack to i8*, !dbg !31
-  tail call void @__kmpc_data_sharing_pop_stack(i8* %7) #4, !dbg !31
-  br label %.exit3, !dbg !31
-
-.exit3: ; preds = %.non-spmd2, %.exit
-  ret i32 %6, !dbg !31
-}
-
-declare i8 @__kmpc_is_spmd_exec_mode() local_unnamed_addr
-
-declare i8* @__kmpc_data_sharing_coalesced_push_stack(i64, i16) local_unnamed_addr
-
-declare i8* @__kmpc_data_sharing_push_stack(i64, i16) local_unnamed_addr
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
-
-declare void @__kmpc_data_sharing_pop_stack(i8*) local_unnamed_addr
-
-; Function Attrs: nounwind
-declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #4
-
-declare void @__kmpc_spmd_kernel_deinit_v2(i16) local_unnamed_addr
-
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
 !llvm.dbg.cu = !{!0}
-!omp_offload.info = !{!3}
-!nvvm.annotations = !{!4}
-!llvm.module.flags = !{!5, !6, !7, !8}
-!llvm.ident = !{!9}
+!llvm.module.flags = !{!3, !4}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
-!1 = !DIFile(filename: "declare_target_codegen_globalization.cpp", directory: "/home/jhuber/Documents/llvm-project/clang/test/OpenMP")
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "globalization_remarks.c", directory: "/tmp/globalization_remarks.c")
 !2 = !{}
-!3 = !{i32 0, i32 2049, i32 50472291, !"_Z6maini1v", i32 17, i32 0}
-!4 = !{void (i32*)* @__omp_offloading_801_3022563__Z6maini1v_l17, !"kernel", i32 1}
-!5 = !{i32 7, !"Dwarf Version", i32 2}
-!6 = !{i32 2, !"Debug Info Version", i32 3}
-!7 = !{i32 1, !"wchar_size", i32 4}
-!8 = !{i32 7, !"PIC Level", i32 2}
-!9 = !{!"clang version 12.0.0"}
-!10 = distinct !DISubprogram(name: "__omp_offloading_801_3022563__Z6maini1v_l17", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
-!11 = !DISubroutineType(types: !2)
-!12 = !DILocation(line: 17, column: 1, scope: !10)
-!13 = !{i32 1, i32 1025}
-!14 = !DILocation(line: 10, column: 1, scope: !15, inlinedAt: !16)
-!15 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !11, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
-!16 = distinct !DILocation(line: 20, column: 18, scope: !17, inlinedAt: !18)
-!17 = distinct !DISubprogram(name: "__omp_outlined__", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
-!18 = distinct !DILocation(line: 17, column: 1, scope: !10)
-!19 = !DILocation(line: 17, column: 40, scope: !10)
-!20 = !DILocation(line: 21, column: 3, scope: !10)
-!21 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 5, type: !11, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
-!22 = !DILocation(line: 5, column: 26, scope: !21)
-!23 = !{!24, !24, i64 0}
-!24 = !{!"int", !25, i64 0}
-!25 = !{!"omnipotent char", !26, i64 0}
-!26 = !{!"Simple C++ TBAA"}
-!27 = !DILocation(line: 5, column: 19, scope: !21)
-!28 = !{i32 0, i32 1024}
-!29 = !DILocation(line: 5, column: 26, scope: !21, inlinedAt: !30)
-!30 = distinct !DILocation(line: 9, column: 10, scope: !15)
-!31 = !DILocation(line: 10, column: 1, scope: !15)
-
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !7, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 5, column: 7, scope: !6)
diff --git a/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll b/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll
--- a/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll
+++ b/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll
@@ -7,11 +7,11 @@
 define void @foo() {
 entry:
-  %x = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0), !dbg !7
+  %x = call i8* @__kmpc_alloc_shared(i64 4), !dbg !7
   %x_on_stack = bitcast i8* %x to i32*
   %0 = bitcast i32* %x_on_stack to i8*
   call void @use(i8* %0)
-  call void @__kmpc_data_sharing_pop_stack(i8* %x)
+  call void @__kmpc_free_shared(i8* %x)
   ret void
 }
@@ -22,7 +22,7 @@
   ret void
 }
 
-define internal i8* @__kmpc_data_sharing_push_stack(i64 %DataSize, i16 %shared) {
+define internal i8* @__kmpc_alloc_shared(i64 %DataSize) {
 entry:
   %call = call i8* @_Z10SafeMallocmPKc(i64 %DataSize, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i64 0, i64 0)) #11
   ret i8* %call
@@ -31,7 +31,7 @@
 ; Function Attrs: convergent nounwind mustprogress
 declare i8* @_Z10SafeMallocmPKc(i64 %size, i8* nocapture readnone %msg)
 
-declare void @__kmpc_data_sharing_pop_stack(i8*)
+declare void @__kmpc_free_shared(i8*)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
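The rewritten tests above all exercise the same pattern: an escaped local is given runtime-managed storage with `__kmpc_alloc_shared`, accessed through a bitcast, and released with a matching `__kmpc_free_shared`. A self-contained C++ illustration of that pairing follows; the mock bodies are hypothetical and only the signatures are taken from this patch:

  #include <cassert>
  #include <cstddef>
  #include <cstdlib>

  // Hypothetical mocks so the example links on a host; real code gets
  // these entry points from the device runtime.
  extern "C" void *__kmpc_alloc_shared(std::size_t Bytes) { return std::malloc(Bytes); }
  extern "C" void __kmpc_free_shared(void *Ptr) { std::free(Ptr); }

  int main() {
    // Mirrors "%0 = call i8* @__kmpc_alloc_shared(i64 4)" in the tests.
    void *Raw = __kmpc_alloc_shared(sizeof(int));
    int *X = static_cast<int *>(Raw); // the %x_on_stack bitcast
    *X = 42;
    assert(*X == 42);
    __kmpc_free_shared(Raw); // paired release of the exact same pointer
    return 0;
  }

Keeping the allocation and the release as a visible call pair is what lets the PhaseOrdering test above verify that OpenMPOpt can still analyze globalization after inlining.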