Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -49,6 +50,7 @@
 #include "llvm/Transforms/IPO/Attributor.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
 #include
@@ -816,6 +818,9 @@
   // TODO: This should be folded into buildCustomStateMachine.
   Changed |= rewriteDeviceCodeStateMachine();
 
+  if (HideMemoryTransferLatency)
+    Changed |= hideMemTransfersLatency();
+
   if (remarksEnabled())
     analysisGlobalization();
 
@@ -833,8 +838,6 @@
 
     Changed |= deleteParallelRegions();
 
-    if (HideMemoryTransferLatency)
-      Changed |= hideMemTransfersLatency();
     Changed |= deduplicateRuntimeCalls();
     if (EnableParallelRegionMerging) {
       if (mergeParallelRegions()) {
@@ -1375,40 +1378,315 @@
     return Changed;
   }
 
+  bool
+  splitMapperToIssueAndWait(CallInst *RuntimeCall,
+                            std::pair<CallInst *, CallInst *> &IssueWaitPair) {
+    auto &IRBuilder = OMPInfoCache.OMPBuilder;
+    Function *F = RuntimeCall->getCaller();
+    const DataLayout &DL = M.getDataLayout();
+    Instruction *FirstInst = &(F->getEntryBlock().front());
+    AllocaInst *Handle = new AllocaInst(
+        IRBuilder.AsyncInfo, DL.getAllocaAddrSpace(), "handle", FirstInst);
+
+    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_issue);
+
+    // Change the RuntimeCall call site to its asynchronous version.
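+    // The "issue" form takes the same operands as the original
+    // __tgt_target_data_begin_mapper call plus a trailing pointer to the
+    // stack-allocated __tgt_async_info handle, and the matching "wait" takes
+    // the ident, the device ID, and that handle. Roughly (call shapes as
+    // exercised by hide_mem_transfer_latency.ll below):
+    //   call void @__tgt_target_data_begin_mapper_issue(ptr @1, i64 -1, ...,
+    //                                                   ptr %handle)
+    //   call void @__tgt_target_data_begin_mapper_wait(ptr @1, i64 -1,
+    //                                                  ptr %handle)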
+    SmallVector<Value *, 16> Args;
+    for (auto &Arg : RuntimeCall->args())
+      Args.push_back(Arg.get());
+    Args.push_back(Handle);
+
+    auto IssueCall =
+        CallInst::Create(IssueDecl, Args, /*NameStr=*/"", RuntimeCall);
+    assert(IssueCall && "Incomplete splitting");
+    IssueWaitPair.first = IssueCall;
+
+    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_wait);
+
+    Value *WaitParams[3] = {
+        RuntimeCall->getArgOperand(0),
+        RuntimeCall->getArgOperand(OffloadArray::DeviceIDArgNum), Handle};
+    auto WaitCall = CallInst::Create(WaitDecl, WaitParams, "", RuntimeCall);
+    assert(WaitCall && "Incomplete splitting");
+    IssueWaitPair.second = WaitCall;
+
+    RuntimeCall->eraseFromParent();
+
+    return true;
+  }
+
+  bool getUseTreeDuplicated(
+      SmallVector<std::pair<Instruction *, Instruction *>> &UseTreeDuplicated,
+      Instruction *I, Instruction *MoveI) {
+    UseTreeDuplicated.push_back({I, nullptr});
+
+    for (unsigned int i = 0; i < UseTreeDuplicated.size(); ++i)
+      for (User *U :
+           dyn_cast<Instruction>(UseTreeDuplicated[i].first)->users()) {
+        if (dyn_cast<CallInst>(U))
+          continue;
+        assert((dyn_cast<StoreInst>(U) || dyn_cast<GetElementPtrInst>(U) ||
+                dyn_cast<BitCastInst>(U)) &&
+               "Problem in data mapping");
+        auto *UInstr = cast<Instruction>(U);
+        if (isSafeToSpeculativelyExecute(UInstr)) // Duplicate it.
+          UseTreeDuplicated.push_back({UInstr, UInstr->clone()});
+        else
+          UseTreeDuplicated.push_back({UInstr, nullptr});
+      }
+
+    int VecSize = UseTreeDuplicated.size();
+    for (int i = 1; i < VecSize; i++) {
+      // Insert the duplicate if it exists.
+      if (UseTreeDuplicated[i].second)
+        UseTreeDuplicated[i].second->insertBefore(UseTreeDuplicated[i].first);
+      // Move the instruction to the right BB.
+      UseTreeDuplicated[i].first->moveBefore(MoveI);
+    }
+
+    return true;
+  }
+
+  Function *outlineMapperIssueRT(CallInst &RuntimeCall) {
+    BasicBlock *RuntimeCallBB = RuntimeCall.getParent();
+    // i8** %offload_baseptrs.
+    Value *BasePtrsArg =
+        RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
+    // i8** %offload_ptrs.
+    Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
+    // i8** %offload_sizes.
+    Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);
+
+    auto *BasePtrsArray =
+        dyn_cast<AllocaInst>(getUnderlyingObject(BasePtrsArg));
+    auto *PtrsArray = dyn_cast<AllocaInst>(getUnderlyingObject(PtrsArg));
+    auto *SizesArray = dyn_cast<AllocaInst>(getUnderlyingObject(SizesArg));
+    if (!BasePtrsArray || !PtrsArray)
+      return nullptr;
+
+    // Split off the block that is going to be outlined.
+    BasicBlock *MapperBB =
+        RuntimeCallBB->splitBasicBlock(&RuntimeCall, "mapper.bb");
+    BasicBlock *NextBB =
+        MapperBB->splitBasicBlock(RuntimeCall.getNextNode(), "next.bb");
+
+    SmallVector<std::pair<Instruction *, Instruction *>> BasePtrsUses;
+    SmallVector<std::pair<Instruction *, Instruction *>> PtrsUses;
+
+    getUseTreeDuplicated(BasePtrsUses, BasePtrsArray, (&RuntimeCall));
+    getUseTreeDuplicated(PtrsUses, PtrsArray, (&RuntimeCall));
+
+    for (auto Bpu : BasePtrsUses)
+      if (Bpu.second)
+        Bpu.first->replaceUsesOutsideBlock(Bpu.second, MapperBB);
+    for (auto Pu : PtrsUses)
+      if (Pu.second)
+        Pu.first->replaceUsesOutsideBlock(Pu.second, MapperBB);
+
+    // For cases with constant array size.
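+    // When every mapped size is a compile-time constant, the front end
+    // typically emits the sizes as a constant global (for instance
+    // "@.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 8]"
+    // in the old test below), so getUnderlyingObject does not yield an
+    // AllocaInst and SizesArray stays null. When the sizes are only known at
+    // run time they live in a stack array such as %.offload_sizes in test1,
+    // and that array's use tree has to be duplicated as well.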
+    if (SizesArray) {
+      SmallVector<std::pair<Instruction *, Instruction *>> SizesArrayUses;
+      getUseTreeDuplicated(SizesArrayUses, SizesArray, (&RuntimeCall));
+      for (auto Sau : SizesArrayUses)
+        if (Sau.second)
+          Sau.first->replaceUsesOutsideBlock(Sau.second, MapperBB);
+    }
+
+    SmallVector<BasicBlock *> ExtractBB{MapperBB};
+    CodeExtractor CE(ExtractBB);
+    CodeExtractorAnalysisCache CEAC(*(MapperBB->getParent()));
+    Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
+    if (!OutlinedFunc)
+      return nullptr;
+
+    OutlinedFunc->setName("__openmp_mapper_issue_wrapper_" +
+                          MapperBB->getParent()->getName());
+
+    MergeBlockIntoPredecessor(NextBB);
+    MergeBlockIntoPredecessor(RuntimeCall.getParent());
+
+    // Annotate the outlined function.
+    OutlinedFunc->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
+    OutlinedFunc->addFnAttr(Attribute::AlwaysInline);
+    int i = 0;
+    for (auto &A : OutlinedFunc->args()) {
+      if (A.getType()->isPointerTy())
+        OutlinedFunc->addParamAttr(i, Attribute::NoCapture);
+      i++;
+    }
+    return OutlinedFunc;
+  }
+
+  // Returns true if \p CI can move before or after \p I.
+  bool canMoveThrough(CallInst *CI, Instruction *I, AliasAnalysis &AA) {
+    if (!(I->mayHaveSideEffects()))
+      return true;
+
+    auto MR = AA.getModRefInfo(I, CI);
+    return isNoModRef(MR);
+  }
+
+  bool canMoveThroughBlock(CallInst *CI, BasicBlock *B, AliasAnalysis &AA) {
+    for (auto &I : *B)
+      if (!canMoveThrough(CI, &I, AA))
+        return false;
+    return true;
+  }
+
+  // This function gets the current BB of the 'issue' call and returns the
+  // next BB the 'issue' call can safely move to.
+  BasicBlock *findNextBBToCheckForMoving(CallInst *CI, BasicBlock *B,
+                                         AliasAnalysis &AA, DominatorTree &DT,
+                                         LoopInfo &LI) {
+    Loop *BLoop = LI.getLoopFor(B);
+    BasicBlock *DestBlock = B;
+    BasicBlock *DomBlock;
+
+    while (true) {
+      auto Dom = DT.getNode(DestBlock)->getIDom();
+      if (!Dom)
+        return B;
+      DomBlock = Dom->getBlock();
+      Loop *DomLoop = LI.getLoopFor(DomBlock);
+      if (BLoop == DomLoop)
+        break;
+      DestBlock = DomBlock;
+    }
+
+    SetVector<BasicBlock *> PredecessorBB;
+    PredecessorBB.insert(B);
+
+    for (unsigned int i = 0; i < PredecessorBB.size(); ++i)
+      for (BasicBlock *S : predecessors(PredecessorBB[i])) {
+        if (S == DomBlock)
+          continue;
+        if (!canMoveThroughBlock(CI, S, AA))
+          return B;
+        PredecessorBB.insert(S);
+      }
+
+    return DomBlock;
+  }
+
+  bool moveWaitRTCInOrigBB(CallInst *IssueWrapperCall, CallInst *RTCallWait,
+                           AliasAnalysis &AA) {
+    Instruction *I = RTCallWait;
+    Instruction *WaitMovePoint;
+
+    while ((I = I->getNextNonDebugInstruction()))
+      if (!canMoveThrough(IssueWrapperCall, I, AA)) {
+        WaitMovePoint = I;
+        break;
+      }
+    if (!I)
+      WaitMovePoint = RTCallWait->getParent()->getTerminator();
+    RTCallWait->moveBefore(WaitMovePoint);
+    return true;
+  }
+
+  bool moveIssueRTCInBB(CallInst *IssueWrapperCall, BasicBlock *CurrentBB,
+                        AliasAnalysis &AA) {
+    Instruction *IssueMovePoint;
+    bool IsOrigBB = (CurrentBB == IssueWrapperCall->getParent());
+    Instruction *I;
+    if (IsOrigBB)
+      I = IssueWrapperCall;
+    else
+      I = CurrentBB->getTerminator();
+
+    while ((I = I->getPrevNonDebugInstruction()))
+      if (!canMoveThrough(IssueWrapperCall, I, AA)) {
+        IssueMovePoint = I;
+        break;
+      }
+
+    // The call can move beyond its original BB; for the time being, move it
+    // to the beginning of the BB.
+    if (!I && IsOrigBB) {
+      IssueWrapperCall->moveBefore(CurrentBB->getFirstNonPHI());
+      return false;
+    }
+    // Insert the issue call at the very beginning of the BB.
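+    // If no blocking instruction was found, nothing in CurrentBB interferes
+    // with the issue call (every instruction is either free of side effects
+    // or proven NoModRef by canMoveThrough), so the earliest position in this
+    // block is right after the PHIs. The overall effect can be seen in test1
+    // of hide_mem_transfer_latency.ll below, where the issue call, together
+    // with the duplicated set-up of the offload arrays, ends up above the
+    // independent update of @global while the wait stays after it.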
+    if (!I) {
+      IssueWrapperCall->moveBefore(CurrentBB->getFirstNonPHI());
+      return true;
+    }
+    IssueWrapperCall->moveAfter(IssueMovePoint);
+    return true;
+  }
+
+  bool moveIssueAndWaitRTC(CallInst *IssueWrapperCall, CallInst *RTCallWait,
+                           AliasAnalysis &AA, LoopInfo &LI, DominatorTree &DT) {
+    BasicBlock *IssueBB = IssueWrapperCall->getParent();
+    moveWaitRTCInOrigBB(IssueWrapperCall, RTCallWait, AA);
+    // Move the issue call within its original BB.
+    bool IssueMovedInOrigBB = moveIssueRTCInBB(IssueWrapperCall, IssueBB, AA);
+    // If it stayed in its original BB, it cannot move beyond it; we are done.
+    if (IssueMovedInOrigBB)
+      return true;
+
+    BasicBlock *CurrentBB = IssueBB;
+    BasicBlock *NextBB;
+    while ((NextBB = findNextBBToCheckForMoving(IssueWrapperCall, CurrentBB, AA,
+                                                DT, LI))) {
+      // It cannot move any further.
+      if (NextBB == CurrentBB)
+        break;
+      CurrentBB = NextBB;
+    }
+    moveIssueRTCInBB(IssueWrapperCall, CurrentBB, AA);
+
+    return true;
+  }
+
   /// Tries to hide the latency of runtime calls that involve host to
   /// device memory transfers by splitting them into their "issue" and "wait"
   /// versions. The "issue" is moved upwards as much as possible. The "wait" is
-  /// moved downards as much as possible. The "issue" issues the memory transfer
-  /// asynchronously, returning a handle. The "wait" waits in the returned
-  /// handle for the memory transfer to finish.
-  bool hideMemTransfersLatency() {
-    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
+  /// moved downwards as much as possible in its original BB. The "issue" issues
+  /// the memory transfer asynchronously, returning a handle. The "wait" waits
+  /// on the returned handle for the memory transfer to finish.
+  bool hideMemTransfersLatencyHelper(Function &F) {
+
+    OMPInformationCache::RuntimeFunctionInfo &RFI =
+        OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
     bool Changed = false;
-    auto SplitMemTransfers = [&](Use &U, Function &Decl) {
+    auto AsyncMemTransfers = [&](Use &U, Function &Decl) {
      auto *RTCall = getCallIfRegularCall(U, &RFI);
      if (!RTCall)
        return false;
+      Changed = true;
 
-      OffloadArray OffloadArrays[3];
-      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
-        return false;
+      std::pair<CallInst *, CallInst *> IssueWaitPair;
+      splitMapperToIssueAndWait(RTCall, IssueWaitPair);
+      CallInst *RTCallIssue = IssueWaitPair.first;
+      CallInst *RTCallWait = IssueWaitPair.second;
+      Function *IssueWrapper = outlineMapperIssueRT(*RTCallIssue);
+      assert(IssueWrapper && "Incomplete outlining in latency hiding");
+
+      // There is a one-to-one relation between the runtime call and
+      // IssueWrapper.
+      CallInst *IssueWrapperCall =
+          cast<CallInst>(*(IssueWrapper->users().begin()));
+      assert(IssueWrapperCall && "Incomplete outlining in latency hiding");
 
-      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
+      AliasAnalysis *AA =
+          OMPInfoCache.getAnalysisResultForFunction<AAManager>(F);
+      LoopInfo *LI = OMPInfoCache.getAnalysisResultForFunction<LoopAnalysis>(F);
+      DominatorTree *DT =
+          OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F);
 
-      // TODO: Check if can be moved upwards.
-      bool WasSplit = false;
-      Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
-      if (WaitMovementPoint)
-        WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
+      moveIssueAndWaitRTC(IssueWrapperCall, RTCallWait, *AA, *LI, *DT);
 
-      Changed |= WasSplit;
-      return WasSplit;
+      return true;
    };
-    RFI.foreachUse(SCC, SplitMemTransfers);
+    RFI.foreachUse(AsyncMemTransfers, &F);
 
     return Changed;
   }
 
+  bool hideMemTransfersLatency() {
+    bool Changed = false;
+    for (auto &F : M)
+      Changed |= hideMemTransfersLatencyHelper(F);
+    return Changed;
+  }
+
   /// Eliminates redundant, aligned barriers in OpenMP offloaded kernels.
   /// TODO: Make this an AA and expand it to work across blocks and functions.
   bool eliminateBarriers() {
@@ -1719,81 +1997,6 @@
     LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
   }
 
-  /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
-  /// moved. Returns nullptr if the movement is not possible, or not worth it.
-  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
-    // FIXME: This traverses only the BasicBlock where RuntimeCall is.
-    //        Make it traverse the CFG.
-
-    Instruction *CurrentI = &RuntimeCall;
-    bool IsWorthIt = false;
-    while ((CurrentI = CurrentI->getNextNode())) {
-
-      // TODO: Once we detect the regions to be offloaded we should use the
-      //       alias analysis manager to check if CurrentI may modify one of
-      //       the offloaded regions.
-      if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
-        if (IsWorthIt)
-          return CurrentI;
-
-        return nullptr;
-      }
-
-      // FIXME: For now if we move it over anything without side effect
-      //        is worth it.
-      IsWorthIt = true;
-    }
-
-    // Return end of BasicBlock.
-    return RuntimeCall.getParent()->getTerminator();
-  }
-
-  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
-  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
-                               Instruction &WaitMovementPoint) {
-    // Create stack allocated handle (__tgt_async_info) at the beginning of the
-    // function. Used for storing information of the async transfer, allowing to
-    // wait on it later.
-    auto &IRBuilder = OMPInfoCache.OMPBuilder;
-    auto *F = RuntimeCall.getCaller();
-    Instruction *FirstInst = &(F->getEntryBlock().front());
-    AllocaInst *Handle = new AllocaInst(
-        IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);
-
-    // Add "issue" runtime call declaration:
-    // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
-    //   i8**, i8**, i64*, i64*)
-    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
-        M, OMPRTL___tgt_target_data_begin_mapper_issue);
-
-    // Change RuntimeCall call site for its asynchronous version.
-    SmallVector<Value *, 16> Args;
-    for (auto &Arg : RuntimeCall.args())
-      Args.push_back(Arg.get());
-    Args.push_back(Handle);
-
-    CallInst *IssueCallsite =
-        CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
-    OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
-    RuntimeCall.eraseFromParent();
-
-    // Add "wait" runtime call declaration:
-    // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
-    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
-        M, OMPRTL___tgt_target_data_begin_mapper_wait);
-
-    Value *WaitParams[2] = {
-        IssueCallsite->getArgOperand(
-            OffloadArray::DeviceIDArgNum), // device_id.
-        Handle                             // handle to wait on.
- }; - CallInst *WaitCallsite = CallInst::Create( - WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); - OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite); - - return true; - } - static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, bool GlobalOnly, bool &SingleChoice) { if (CurrentIdent == NextIdent) Index: llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll =================================================================== --- llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll +++ llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll @@ -1,528 +1,417 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature -; RUN: opt -S -passes=openmp-opt-cgscc -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency < %s | FileCheck %s -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" - -; CHECK: %struct.__tgt_async_info = type { i8* } - -%struct.ident_t = type { i32, i32, i32, i32, i8* } -%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } - -@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] -@.__omp_offloading_heavyComputation1.region_id = weak constant i8 0 -@.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 8] -@.offload_maptypes.2 = private unnamed_addr constant [1 x i64] [i64 800] - -@.__omp_offloading_heavyComputation2.region_id = weak constant i8 0 -@.offload_maptypes.3 = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -O3 -openmp-hide-memory-transfer-latency < %s | FileCheck %s -@.__omp_offloading_heavyComputation3.region_id = weak constant i8 0 -@.offload_sizes.2 = private unnamed_addr constant [2 x i64] [i64 4, i64 0] -@.offload_maptypes.4 = private unnamed_addr constant [2 x i64] [i64 800, i64 544] +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" -@.offload_maptypes.5 = private unnamed_addr constant [1 x i64] [i64 33] +%struct.ident_t = type { i32, i32, i32, i32, ptr } -@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8 -@.str0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@global = dso_local local_unnamed_addr global i32 0, align 4 +@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 1] +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8 +@llvm.embedded.object = private constant [1 x i8] c"0", section ".llvm.offloading", align 8, !exclude !0 +@llvm.compiler.used = appending global [1 x ptr] [ptr @llvm.embedded.object], section "llvm.metadata" -;double heavyComputation1() { -; double a = rand() % 777; -; double random = rand(); -; -; //#pragma omp target data map(a) -; void* args[1]; -; args[0] = &a; -; __tgt_target_data_begin(..., args, ...) 
-; -; #pragma omp target teams -; for (int i = 0; i < 1000; ++i) { -; a *= i*i / 2; -; } -; -; return random + a; -;} -define dso_local double @heavyComputation1() { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation1() { +; Function Attrs: nounwind uwtable +define dso_local void @test1(i32 noundef %N) local_unnamed_addr #0 { +; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 -; CHECK-NEXT: [[A:%.*]] = alloca double, align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS4:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS5:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to i8* -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 777 -; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to double -; CHECK-NEXT: store double [[CONV]], double* [[A]], align 8 -; CHECK-NEXT: [[CALL1:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @[[GLOB0:[0-9]+]], i64 -1, i32 1, i8** [[TMP1]], i8** [[TMP3]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to i64* -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i64 0, i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS4]] to i64* -; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP8]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS5]], i64 0, i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS5]] to i64* -; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP10]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull [[TMP7]], i8** nonnull [[TMP9]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP11]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation1FallBack(i64 [[TMP6]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[CONV2:%.*]] = sitofp i32 [[CALL1]] to 
double -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[A]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP12]], [[CONV2]] -; CHECK-NEXT: ret double [[ADD]] -; - - - - - - -entry: - %a = alloca double, align 8 - %.offload_baseptrs = alloca [1 x i8*], align 8 - %.offload_ptrs = alloca [1 x i8*], align 8 - %.offload_baseptrs4 = alloca [1 x i8*], align 8 - %.offload_ptrs5 = alloca [1 x i8*], align 8 - - %0 = bitcast double* %a to i8* - %call = tail call i32 (...) @rand() - %rem = srem i32 %call, 777 - %conv = sitofp i32 %rem to double - store double %conv, double* %a, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @0, ...) should be moved here. - %call1 = tail call i32 (...) @rand() - - %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [1 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [1 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) - - %5 = bitcast double* %a to i64* - %6 = load i64, i64* %5, align 8 - %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0 - %8 = bitcast [1 x i8*]* %.offload_baseptrs4 to i64* - store i64 %6, i64* %8, align 8 - %9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0 - %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64* - store i64 %6, i64* %10, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. 
- %11 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %11, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation1FallBack(i64 %6) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %conv2 = sitofp i32 %call1 to double - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) - %12 = load double, double* %a, align 8 - %add = fadd double %12, %conv2 - ret double %add -} - -define internal void @heavyComputation1FallBack(i64 %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation1FallBack -; CHECK-SAME: (i64 [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret void -; -entry: - ; Fallback for offloading function heavyComputation1. - ret void -} - -;int heavyComputation2(double* a, unsigned size) { -; int random = rand() % 7; -; -; //#pragma omp target data map(a[0:size], size) -; void* args[2]; -; args[0] = &a; -; args[1] = &size; -; __tgt_target_data_begin(..., args, ...) -; -; #pragma omp target teams -; for (int i = 0; i < size; ++i) { -; a[i] = ++a[i] * 3.141624; -; } -; -; return random; -;} -define dso_local i32 @heavyComputation2(double* %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation2 -; CHECK-SAME: (double* [[A:%.*]], i32 [[SIZE:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SIZE_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: store i32 [[SIZE]], i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@rand() -; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0 -; CHECK-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8** [[TMP8]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 1 -; CHECK-NEXT: store i64 4, i64* [[TMP10]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[SIZE_CASTED:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP13]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS3]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 1 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP17]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 1 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull [[TMP12]], i8** nonnull [[TMP14]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP20]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation2FallBack(i64 [[SIZE_CASTED]], double* 
[[A]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 7 -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: ret i32 [[REM]] -; - - -entry: - %size.addr = alloca i32, align 4 - %.offload_baseptrs = alloca [2 x i8*], align 8 - %.offload_ptrs = alloca [2 x i8*], align 8 - %.offload_sizes = alloca [2 x i64], align 8 - %.offload_baseptrs2 = alloca [2 x i8*], align 8 - %.offload_ptrs3 = alloca [2 x i8*], align 8 - - store i32 %size, i32* %size.addr, align 4 - %call = tail call i32 (...) @rand() - - %conv = zext i32 %size to i64 - %0 = shl nuw nsw i64 %conv, 3 - %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [2 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [2 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0 - store i64 %0, i64* %5, align 8 - %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1 - %7 = bitcast i8** %6 to i32** - store i32* %size.addr, i32** %7, align 8 - %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1 - %9 = bitcast i8** %8 to i32** - store i32* %size.addr, i32** %9, align 8 - %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 - store i64 4, i64* %10, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - - %11 = load i32, i32* %size.addr, align 4 - %size.casted = zext i32 %11 to i64 - %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 - %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64* - store i64 %size.casted, i64* %13, align 8 - %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0 - %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64* - store i64 %size.casted, i64* %15, align 8 - %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1 - %17 = bitcast i8** %16 to double** - store double* %a, double** %17, align 8 - %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1 - %19 = bitcast i8** %18 to double** - store double* %a, double** %19, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. 
- %20 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %20, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation2FallBack(i64 %size.casted, double* %a) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %rem = srem i32 %call, 7 - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - ret i32 %rem -} - -define internal void @heavyComputation2FallBack(i64 %size, double* %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation2FallBack -; CHECK-SAME: (i64 [[SIZE:%.*]], double* [[A:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [1 x i64], align 8 +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[CONV]], 2 +; CHECK-NEXT: [[CALL:%.*]] = tail call noalias ptr @malloc(i64 noundef [[MUL]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER1:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: for.body.preheader1: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ [[N_VEC:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDEX]] +; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP0]], align 4, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER1]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: store ptr [[CALL]], ptr [[DOTOFFLOAD_BASEPTRS]], align 8 +; CHECK-NEXT: store ptr [[CALL]], ptr [[DOTOFFLOAD_PTRS]], align 8 +; CHECK-NEXT: store i64 [[MUL]], ptr [[DOTOFFLOAD_SIZES]], align 8 +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_issue(ptr nonnull @[[GLOB1:[0-9]+]], i64 -1, i32 
1, ptr nonnull [[DOTOFFLOAD_BASEPTRS]], ptr nonnull [[DOTOFFLOAD_PTRS]], ptr nonnull [[DOTOFFLOAD_SIZES]], ptr nonnull @.offload_maptypes, ptr null, ptr null, ptr nonnull [[HANDLE]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @global, align 4, !tbaa [[TBAA8]] +; CHECK-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP2]], 1 +; CHECK-NEXT: store i32 [[INC2]], ptr @global, align 4, !tbaa [[TBAA8]] +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(ptr nonnull @[[GLOB1]], i64 -1, ptr nonnull [[HANDLE]]) #[[ATTR2]] +; CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @[[GLOB1]], i64 -1, i32 1, ptr nonnull [[DOTOFFLOAD_BASEPTRS]], ptr nonnull [[DOTOFFLOAD_PTRS]], ptr nonnull [[DOTOFFLOAD_SIZES]], ptr nonnull @.offload_maptypes, ptr null, ptr null) ; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER1]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA8]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; entry: - ; Fallback for offloading function heavyComputation2. + %.offload_baseptrs = alloca [1 x ptr], align 8 + %.offload_ptrs = alloca [1 x ptr], align 8 + %.offload_sizes = alloca [1 x i64], align 8 + %conv = sext i32 %N to i64 + %mul = shl nsw i64 %conv, 2 + %call = tail call noalias ptr @malloc(i64 noundef %mul) #3 + %cmp8 = icmp sgt i32 %N, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + %min.iters.check = icmp ult i32 %N, 4 + br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i64 %wide.trip.count, 4294967292 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi <4 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %call, i64 %index + store <4 x i32> %vec.ind, ptr %0, align 4, !tbaa !8 + %index.next = add nuw i64 %index, 4 + %vec.ind.next = add <4 x i32> %vec.ind, + %1 = icmp eq i64 %index.next, %n.vec + br i1 %1, label %middle.block, label %vector.body, !llvm.loop !12 + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11 + +for.body.preheader11: ; preds = %for.body.preheader, %middle.block + %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + %2 = load i32, ptr @global, align 4, !tbaa !8 + %inc2 = add nsw i32 %2, 1 + store i32 %inc2, ptr @global, align 4, !tbaa !8 + store ptr %call, ptr %.offload_baseptrs, align 8 + store ptr %call, ptr %.offload_ptrs, align 8 + store i64 %mul, ptr %.offload_sizes, align 8 + call void @__tgt_target_data_begin_mapper(ptr nonnull @1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull 
@.offload_maptypes, ptr null, ptr null) + call void @__tgt_target_data_end_mapper(ptr nonnull @1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes, ptr null, ptr null) ret void -} -;int heavyComputation3(double* restrict a, unsigned size) { -; int random = rand() % 7; -; -; //#pragma omp target data map(a[0:size], size) -; void* args[2]; -; args[0] = &a; -; args[1] = &size; -; __tgt_target_data_begin(..., args, ...) -; -; #pragma omp target teams -; for (int i = 0; i < size; ++i) { -; a[i] = ++a[i] * 3.141624; -; } -; -; return random; -;} -define dso_local i32 @heavyComputation3(double* noalias %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation3 -; CHECK-SAME: (double* noalias [[A:%.*]], i32 [[SIZE:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SIZE_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: store i32 [[SIZE]], i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0 -; CHECK-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8** [[TMP8]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 1 -; CHECK-NEXT: store i64 4, i64* [[TMP10]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[SIZE_CASTED:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP13]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* 
[[DOTOFFLOAD_PTRS3]], i64 0, i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS3]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 1 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP17]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 1 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull [[TMP12]], i8** nonnull [[TMP14]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP20]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation3FallBack(i64 [[SIZE_CASTED]], double* [[A]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 7 -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: ret i32 [[REM]] -; - - -entry: - %size.addr = alloca i32, align 4 - %.offload_baseptrs = alloca [2 x i8*], align 8 - %.offload_ptrs = alloca [2 x i8*], align 8 - %.offload_sizes = alloca [2 x i64], align 8 - %.offload_baseptrs2 = alloca [2 x i8*], align 8 - %.offload_ptrs3 = alloca [2 x i8*], align 8 - store i32 %size, i32* %size.addr, align 4 - - ; FIXME: call to @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @0, ...) should be moved here. - %call = tail call i32 (...) 
@rand() - - %conv = zext i32 %size to i64 - %0 = shl nuw nsw i64 %conv, 3 - %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [2 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [2 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0 - store i64 %0, i64* %5, align 8 - %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1 - %7 = bitcast i8** %6 to i32** - store i32* %size.addr, i32** %7, align 8 - %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1 - %9 = bitcast i8** %8 to i32** - store i32* %size.addr, i32** %9, align 8 - %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 - store i64 4, i64* %10, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - - %11 = load i32, i32* %size.addr, align 4 - %size.casted = zext i32 %11 to i64 - %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 - %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64* - store i64 %size.casted, i64* %13, align 8 - %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0 - %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64* - store i64 %size.casted, i64* %15, align 8 - %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1 - %17 = bitcast i8** %16 to double** - store double* %a, double** %17, align 8 - %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1 - %19 = bitcast i8** %18 to double** - store double* %a, double** %19, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. 
- %20 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %20, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation3FallBack(i64 %size.casted, double* %a) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %rem = srem i32 %call, 7 - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - ret i32 %rem +for.body: ; preds = %for.body.preheader11, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader11 ] + %arrayidx = getelementptr inbounds i32, ptr %call, i64 %indvars.iv + %3 = trunc i64 %indvars.iv to i32 + store i32 %3, ptr %arrayidx, align 4, !tbaa !8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !16 } -define internal void @heavyComputation3FallBack(i64 %size, double* %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation3FallBack -; CHECK-SAME: (i64 [[SIZE:%.*]], double* [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret void -; -entry: - ; Fallback for offloading function heavyComputation3. - ret void -} -;int dataTransferOnly1(double* restrict a, unsigned size) { -; // Random computation. -; int random = rand(); -; -; //#pragma omp target data map(to:a[0:size]) -; void* args[1]; -; args[0] = &a; -; __tgt_target_data_begin(..., args, ...) -; -; // Random computation. -; random %= size; -; return random; -;} -define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@dataTransferOnly1 -; CHECK-SAME: (double* noalias [[A:%.*]], i32 [[SIZE:%.*]]) { +define dso_local void @test2(i32 noundef %N) local_unnamed_addr #0 { +; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x i8*], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 ; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [1 x i64], align 8 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@rand()
-; CHECK-NEXT:    [[CONV:%.*]] = zext i32 [[SIZE]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double**
-; CHECK-NEXT:    store double* [[A]], double** [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS]] to double**
-; CHECK-NEXT:    store double* [[A]], double** [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x i64], [1 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0
-; CHECK-NEXT:    store i64 [[TMP0]], i64* [[TMP5]], align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** [[TMP1]], i8** [[TMP3]], i64* [[TMP5]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null, %struct.__tgt_async_info* [[HANDLE]])
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[CALL]], [[SIZE]]
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info* [[HANDLE]])
-; CHECK-NEXT:    call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null)
-; CHECK-NEXT:    ret i32 [[REM]]
+; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[N:%.*]] to i64
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[CONV]], 2
+; CHECK-NEXT:    [[CALL:%.*]] = tail call noalias ptr @malloc(i64 noundef [[MUL]]) #[[ATTR3]]
+; CHECK-NEXT:    [[CMP18:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP18]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP6:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER2:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       for.body.preheader2:
+; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ [[N_VEC:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDEX]]
+; CHECK-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP0]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND8_PREHEADER_US_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER2]]
+; CHECK:       for.cond8.preheader.us.us.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i32 [[N]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i33
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i32 [[N]], -2
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i33
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i33 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr i33 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i33 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[N]], 1001
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP8]], 1000
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[N]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], [[TMP12]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_COND8_PREHEADER_US_US_PREHEADER1:%.*]], label [[VECTOR_PH45:%.*]]
+; CHECK:       vector.ph45:
+; CHECK-NEXT:    [[N_VEC47:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP16]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY50:%.*]]
+; CHECK:       vector.body50:
+; CHECK-NEXT:    [[INDEX51:%.*]] = phi i64 [ 0, [[VECTOR_PH45]] ], [ [[INDEX_NEXT52:%.*]], [[VECTOR_BODY50]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDEX51]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP17]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    store <4 x i32> [[TMP18]], ptr [[TMP17]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[INDEX_NEXT52]] = add nuw nsw i64 [[INDEX51]], 4
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT52]], [[N_VEC47]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK42:%.*]], label [[VECTOR_BODY50]], !llvm.loop [[LOOP16]]
+; CHECK:       middle.block42:
+; CHECK-NEXT:    [[CMP_N49:%.*]] = icmp eq i64 [[N_VEC47]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[CMP_N49]], label [[FOR_COND_CLEANUP6]], label [[FOR_COND8_PREHEADER_US_US_PREHEADER1]]
+; CHECK:       for.cond8.preheader.us.us.preheader1:
+; CHECK-NEXT:    [[INDVARS_IV37_PH:%.*]] = phi i64 [ [[N_VEC47]], [[MIDDLE_BLOCK42]] ], [ 0, [[FOR_COND8_PREHEADER_US_US_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_COND8_PREHEADER_US_US:%.*]]
+; CHECK:       for.cond8.preheader.us.us:
+; CHECK-NEXT:    [[INDVARS_IV37:%.*]] = phi i64 [ [[INDVARS_IV_NEXT38:%.*]], [[FOR_COND8_PREHEADER_US_US]] ], [ [[INDVARS_IV37_PH]], [[FOR_COND8_PREHEADER_US_US_PREHEADER1]] ]
+; CHECK-NEXT:    [[ARRAYIDX20_US_US:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDVARS_IV37]]
+; CHECK-NEXT:    [[ARRAYIDX20_PROMOTED_US_US:%.*]] = load i32, ptr [[ARRAYIDX20_US_US]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[ARRAYIDX20_PROMOTED_US_US]], [[TMP16]]
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[ARRAYIDX20_US_US]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT38]] = add nuw nsw i64 [[INDVARS_IV37]], 1
+; CHECK-NEXT:    [[EXITCOND41_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT38]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND41_NOT]], label [[FOR_COND_CLEANUP6]], label [[FOR_COND8_PREHEADER_US_US]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER2]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    store i32 [[TMP21]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND8_PREHEADER_US_US_PREHEADER]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK:       for.cond.cleanup6:
+; CHECK-NEXT:    store ptr [[CALL]], ptr [[DOTOFFLOAD_BASEPTRS]], align 8
+; CHECK-NEXT:    store ptr [[CALL]], ptr [[DOTOFFLOAD_PTRS]], align 8
+; CHECK-NEXT:    store i64 [[MUL]], ptr [[DOTOFFLOAD_SIZES]], align 8
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_issue(ptr nonnull @[[GLOB1]], i64 -1, i32 1, ptr nonnull [[DOTOFFLOAD_BASEPTRS]], ptr nonnull [[DOTOFFLOAD_PTRS]], ptr nonnull [[DOTOFFLOAD_SIZES]], ptr nonnull @.offload_maptypes, ptr null, ptr null, ptr nonnull [[HANDLE]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr @global, align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    [[INC31:%.*]] = add nsw i32 [[TMP22]], 1
+; CHECK-NEXT:    [[CMP32:%.*]] = icmp sgt i32 [[TMP22]], 0
+; CHECK-NEXT:    br i1 [[CMP32]], label [[IF_THEN:%.*]], label [[IF_END44:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[INC34:%.*]] = add nuw nsw i32 [[TMP22]], 2
+; CHECK-NEXT:    [[CMP35:%.*]] = icmp ugt i32 [[TMP22]], 4
+; CHECK-NEXT:    br i1 [[CMP35]], label [[IF_THEN37:%.*]], label [[IF_END44]]
+; CHECK:       if.then37:
+; CHECK-NEXT:    [[CMP39:%.*]] = icmp ult i32 [[TMP22]], 8
+; CHECK-NEXT:    [[SPEC_SELECT_V:%.*]] = select i1 [[CMP39]], i32 4, i32 3
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = add nuw nsw i32 [[SPEC_SELECT_V]], [[TMP22]]
+; CHECK-NEXT:    br label [[IF_END44]]
+; CHECK:       if.end44:
+; CHECK-NEXT:    [[TMP23:%.*]] = phi i32 [ [[INC34]], [[IF_THEN]] ], [ [[INC31]], [[FOR_COND_CLEANUP6]] ], [ [[SPEC_SELECT]], [[IF_THEN37]] ]
+; CHECK-NEXT:    [[INC45:%.*]] = add nsw i32 [[TMP23]], 1
+; CHECK-NEXT:    store i32 [[INC45]], ptr @global, align 4, !tbaa [[TBAA8]]
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(ptr nonnull @[[GLOB1]], i64 -1, ptr nonnull [[HANDLE]]) #[[ATTR2]]
+; CHECK-NEXT:    call void @__tgt_target_data_end_mapper(ptr nonnull @[[GLOB1]], i64 -1, i32 1, ptr nonnull [[DOTOFFLOAD_BASEPTRS]], ptr nonnull [[DOTOFFLOAD_PTRS]], ptr nonnull [[DOTOFFLOAD_SIZES]], ptr nonnull @.offload_maptypes, ptr null, ptr null)
+; CHECK-NEXT:    ret void
 ;
-
-
-
-
-
-
 entry:
-  %.offload_baseptrs = alloca [1 x i8*], align 8
-  %.offload_ptrs = alloca [1 x i8*], align 8
+  %.offload_baseptrs = alloca [1 x ptr], align 8
+  %.offload_ptrs = alloca [1 x ptr], align 8
   %.offload_sizes = alloca [1 x i64], align 8
-
-  ; FIXME: call to @__tgt_target_data_begin_issue_mapper(...) should be moved here.
-  %call = tail call i32 (...) @rand()
-
-  %conv = zext i32 %size to i64
-  %0 = shl nuw nsw i64 %conv, 3
-  %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
-  %2 = bitcast [1 x i8*]* %.offload_baseptrs to double**
-  store double* %a, double** %2, align 8
-  %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
-  %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
-  store double* %a, double** %4, align 8
-  %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
-  store i64 %0, i64* %5, align 8
-  call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null)
-
-  %rem = urem i32 %call, %size
-
-  call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null)
-  ret i32 %rem
+  %conv = sext i32 %N to i64
+  %mul = shl nsw i64 %conv, 2
+  %call = tail call noalias ptr @malloc(i64 noundef %mul) #3
+  %cmp18 = icmp sgt i32 %N, 0
+  br i1 %cmp18, label %for.body.preheader, label %for.cond.cleanup6
+
+for.body.preheader: ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  %min.iters.check = icmp ult i32 %N, 4
+  br i1 %min.iters.check, label %for.body.preheader54, label %vector.ph
+
+vector.ph: ; preds = %for.body.preheader
+  %n.vec = and i64 %wide.trip.count, 4294967292
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = getelementptr inbounds i32, ptr %call, i64 %index
+  store <4 x i32> %vec.ind, ptr %0, align 4, !tbaa !8
+  %index.next = add nuw i64 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
+  %1 = icmp eq i64 %index.next, %n.vec
+  br i1 %1, label %middle.block, label %vector.body, !llvm.loop !12
+
+middle.block: ; preds = %vector.body
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br i1 %cmp.n, label %for.cond3.preheader, label %for.body.preheader54
+
+for.body.preheader54: ; preds = %for.body.preheader, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
+  br label %for.body
+
+for.cond3.preheader: ; preds = %for.body, %middle.block
+  br i1 %cmp18, label %for.cond8.preheader.us.us.preheader, label %for.cond.cleanup6
+
+for.cond8.preheader.us.us.preheader: ; preds = %for.cond3.preheader
+  %2 = add nsw i32 %N, -1
+  %3 = zext i32 %2 to i33
+  %4 = add nsw i32 %N, -2
+  %5 = zext i32 %4 to i33
+  %6 = mul i33 %3, %5
+  %7 = lshr i33 %6, 1
+  %8 = trunc i33 %7 to i32
+  %9 = mul i32 %N, 1001
+  %10 = add i32 %9, %8
+  %11 = add i32 %10, -1
+  %12 = mul i32 %11, %2
+  %13 = add i32 %8, 1000
+  %14 = mul i32 %13, %N
+  %15 = add i32 %12, %14
+  %16 = add i32 %15, %8
+  %17 = add i32 %16, -1
+  %wide.trip.count40 = zext i32 %N to i64
+  %min.iters.check44 = icmp ult i32 %N, 4
+  br i1 %min.iters.check44, label %for.cond8.preheader.us.us.preheader53, label %vector.ph45
+
+vector.ph45: ; preds = %for.cond8.preheader.us.us.preheader
+  %n.vec47 = and i64 %wide.trip.count, 4294967292
+  %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %17, i64 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
+  br label %vector.body50
+
+vector.body50: ; preds = %vector.body50, %vector.ph45
+  %index51 = phi i64 [ 0, %vector.ph45 ], [ %index.next52, %vector.body50 ]
+  %18 = getelementptr inbounds i32, ptr %call, i64 %index51
+  %wide.load = load <4 x i32>, ptr %18, align 4, !tbaa !8
+  %19 = add <4 x i32> %broadcast.splat, %wide.load
+  store <4 x i32> %19, ptr %18, align 4, !tbaa !8
+  %index.next52 = add nuw i64 %index51, 4
+  %20 = icmp eq i64 %index.next52, %n.vec47
+  br i1 %20, label %middle.block42, label %vector.body50, !llvm.loop !16
+
+middle.block42: ; preds = %vector.body50
+  %cmp.n49 = icmp eq i64 %n.vec47, %wide.trip.count
+  br i1 %cmp.n49, label %for.cond.cleanup6, label %for.cond8.preheader.us.us.preheader53
+
+for.cond8.preheader.us.us.preheader53: ; preds = %for.cond8.preheader.us.us.preheader, %middle.block42
+  %indvars.iv37.ph = phi i64 [ 0, %for.cond8.preheader.us.us.preheader ], [ %n.vec47, %middle.block42 ]
+  br label %for.cond8.preheader.us.us
+
+for.cond8.preheader.us.us: ; preds = %for.cond8.preheader.us.us.preheader53, %for.cond8.preheader.us.us
+  %indvars.iv37 = phi i64 [ %indvars.iv.next38, %for.cond8.preheader.us.us ], [ %indvars.iv37.ph, %for.cond8.preheader.us.us.preheader53 ]
+  %arrayidx20.us.us = getelementptr inbounds i32, ptr %call, i64 %indvars.iv37
+  %arrayidx20.promoted.us.us = load i32, ptr %arrayidx20.us.us, align 4, !tbaa !8
+  %21 = add i32 %17, %arrayidx20.promoted.us.us
+  store i32 %21, ptr %arrayidx20.us.us, align 4, !tbaa !8
+  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+  %exitcond41.not = icmp eq i64 %indvars.iv.next38, %wide.trip.count40
+  br i1 %exitcond41.not, label %for.cond.cleanup6, label %for.cond8.preheader.us.us, !llvm.loop !17
+
+for.body: ; preds = %for.body.preheader54, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader54 ]
+  %arrayidx = getelementptr inbounds i32, ptr %call, i64 %indvars.iv
+  %22 = trunc i64 %indvars.iv to i32
+  store i32 %22, ptr %arrayidx, align 4, !tbaa !8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond3.preheader, label %for.body, !llvm.loop !18
+
+for.cond.cleanup6: ; preds = %for.cond8.preheader.us.us, %middle.block42, %entry, %for.cond3.preheader
+  %23 = load i32, ptr @global, align 4, !tbaa !8
+  %inc31 = add nsw i32 %23, 1
+  %cmp32 = icmp sgt i32 %23, 0
+  br i1 %cmp32, label %if.then, label %if.end44
+
+if.then: ; preds = %for.cond.cleanup6
+  %inc34 = add nuw nsw i32 %23, 2
+  %cmp35 = icmp ugt i32 %23, 4
+  br i1 %cmp35, label %if.then37, label %if.end44
+
+if.then37: ; preds = %if.then
+  %cmp39 = icmp ult i32 %23, 8
+  %spec.select.v = select i1 %cmp39, i32 4, i32 3
+  %spec.select = add nuw nsw i32 %23, %spec.select.v
+  br label %if.end44
+
+if.end44: ; preds = %if.then37, %if.then, %for.cond.cleanup6
+  %24 = phi i32 [ %inc34, %if.then ], [ %inc31, %for.cond.cleanup6 ], [ %spec.select, %if.then37 ]
+  %inc45 = add nsw i32 %24, 1
+  store i32 %inc45, ptr @global, align 4, !tbaa !8
+  store ptr %call, ptr %.offload_baseptrs, align 8
+  store ptr %call, ptr %.offload_ptrs, align 8
+  store i64 %mul, ptr %.offload_sizes, align 8
+  call void @__tgt_target_data_begin_mapper(ptr nonnull @1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes, ptr null, ptr null)
+  call void @__tgt_target_data_end_mapper(ptr nonnull @1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes, ptr null, ptr null)
+  ret void
 }
-declare void @__tgt_target_data_begin_mapper(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**)
-declare i32 @__tgt_target_teams_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**, i32, i32)
-declare void @__tgt_target_data_end_mapper(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**)
-
-declare dso_local i32 @rand(...)
-; CHECK: declare void @__tgt_target_data_begin_mapper_issue(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**, %struct.__tgt_async_info*)
-; CHECK: declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info*)
-!llvm.module.flags = !{!0}
+; Function Attrs: inaccessiblememonly mustprogress nofree nounwind willreturn allockind("alloc,uninitialized") allocsize(0)
+declare noalias noundef ptr @malloc(i64 noundef) local_unnamed_addr #1
+
+; Function Attrs: nounwind
+declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) local_unnamed_addr #2
+
+; Function Attrs: nounwind
+declare void @__tgt_target_data_end_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) local_unnamed_addr #2
+
+attributes #0 = { nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { inaccessiblememonly mustprogress nofree nounwind willreturn allockind("alloc,uninitialized") allocsize(0) "alloc-family"="malloc" "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { nounwind }
+attributes #3 = { nounwind allocsize(0) }
+
+!llvm.module.flags = !{!1, !2, !3, !4, !5}
+!llvm.ident = !{!6}
+!llvm.embedded.objects = !{!7}
+!nvvm.annotations = !{}
+
+!0 = !{}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 7, !"openmp", i32 50}
+!3 = !{i32 7, !"PIC Level", i32 2}
+!4 = !{i32 7, !"PIE Level", i32 2}
+!5 = !{i32 7, !"uwtable", i32 2}
+!6 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git cf1d9a1fdca258ab56f3060dfa4a303b8127350e)"}
+!7 = !{ptr @llvm.embedded.object, !".llvm.offloading"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"int", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C/C++ TBAA"}
+!12 = distinct !{!12, !13, !14, !15}
+!13 = !{!"llvm.loop.mustprogress"}
+!14 = !{!"llvm.loop.unroll.disable"}
+!15 = !{!"llvm.loop.isvectorized", i32 1}
+!16 = distinct !{!16, !13, !14, !15}
+!17 = distinct !{!17, !13, !14, !15}
+!18 = distinct !{!18, !13, !14, !15}
-!0 = !{i32 7, !"openmp", i32 50}
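For reference, here is a minimal IR sketch (not part of the patch) of the issue/wait shape the updated CHECK lines above verify: the data-begin transfer is issued asynchronously, independent host work overlaps it, and the wait is placed right before the mapped data is needed. The function name @sketch, the %ident/%bp/%p/%sz/%maptypes parameters, and the single-pointer layout of %struct.__tgt_async_info are assumptions made only for this example.

; Sketch only: illustrates the issue/wait split checked above; names are illustrative.
%struct.__tgt_async_info = type { ptr } ; assumed single-pointer layout for the example

define void @sketch(ptr %ident, ptr %bp, ptr %p, ptr %sz, ptr %maptypes) {
entry:
  %handle = alloca %struct.__tgt_async_info, align 8
  ; Issue: same arguments as __tgt_target_data_begin_mapper, plus the async handle.
  call void @__tgt_target_data_begin_mapper_issue(ptr %ident, i64 -1, i32 1, ptr %bp, ptr %p, ptr %sz, ptr %maptypes, ptr null, ptr null, ptr %handle)
  ; ... independent host computation overlaps the transfer here ...
  ; Wait: synchronize on the handle before the device data is used.
  call void @__tgt_target_data_begin_mapper_wait(ptr %ident, i64 -1, ptr %handle)
  ret void
}

declare void @__tgt_target_data_begin_mapper_issue(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr)
declare void @__tgt_target_data_begin_mapper_wait(ptr, i64, ptr)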