Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -49,6 +50,7 @@
 #include "llvm/Transforms/IPO/Attributor.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
 
 #include <algorithm>
@@ -816,6 +818,9 @@
       // TODO: This should be folded into buildCustomStateMachine.
       Changed |= rewriteDeviceCodeStateMachine();
 
+      if (HideMemoryTransferLatency)
+        Changed |= hideMemTransfersLatency();
+
       if (remarksEnabled())
         analysisGlobalization();
 
@@ -833,8 +838,6 @@
 
       Changed |= deleteParallelRegions();
 
-      if (HideMemoryTransferLatency)
-        Changed |= hideMemTransfersLatency();
       Changed |= deduplicateRuntimeCalls();
       if (EnableParallelRegionMerging) {
         if (mergeParallelRegions()) {
@@ -1375,40 +1378,380 @@
     return Changed;
   }
 
+  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts and
+  /// returns them in \p IssueWaitPair.
+  bool
+  splitMapperToIssueAndWait(CallInst *RuntimeCall,
+                            std::pair<CallInst *, CallInst *> &IssueWaitPair) {
+    auto &IRBuilder = OMPInfoCache.OMPBuilder;
+    Function *F = RuntimeCall->getCaller();
+    const DataLayout &DL = M.getDataLayout();
+    Instruction *FirstInst = &(F->getEntryBlock().front());
+    AllocaInst *Handle = new AllocaInst(
+        IRBuilder.AsyncInfo, DL.getAllocaAddrSpace(), "handle", FirstInst);
+
+    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_issue);
+
+    // Change the RuntimeCall call site to its asynchronous version.
+    SmallVector<Value *, 16> Args;
+    for (auto &Arg : RuntimeCall->args())
+      Args.push_back(Arg.get());
+    Args.push_back(Handle);
+
+    auto IssueCall =
+        CallInst::Create(IssueDecl, Args, /*NameStr=*/"", RuntimeCall);
+    assert(IssueCall && "Incomplete splitting");
+    IssueWaitPair.first = IssueCall;
+
+    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_wait);
+
+    Value *WaitParams[3] = {
+        RuntimeCall->getArgOperand(0),
+        RuntimeCall->getArgOperand(OffloadArray::DeviceIDArgNum), Handle};
+    auto WaitCall = CallInst::Create(WaitDecl, WaitParams, "", RuntimeCall);
+    assert(WaitCall && "Incomplete splitting");
+    IssueWaitPair.second = WaitCall;
+
+    RuntimeCall->eraseFromParent();
+
+    return true;
+  }
+
+  // Collects the transitive users of \p I (and the instructions those users
+  // depend on), cloning every node that is safe to speculatively execute, and
+  // moves the original instructions right before \p MoveI. The clones are
+  // left at the original position so the caller can redirect uses outside the
+  // moved region to them.
+  bool getUseTreeDuplicated(
+      SmallVector<std::pair<Instruction *, Instruction *>> &UseTreeDuplicated,
+      Instruction *I, Instruction *MoveI) {
+    UseTreeDuplicated.push_back({I, nullptr});
+
+    for (unsigned i = 0; i < UseTreeDuplicated.size(); ++i)
+      for (User *U : UseTreeDuplicated[i].first->users()) {
+        if (dyn_cast<CallInst>(U))
+          continue;
+
+        assert((dyn_cast(U) || dyn_cast(U) ||
+                dyn_cast(U)) &&
+               "Problem in data mapping");
+
+        auto *UInstr = cast<Instruction>(U);
+
+        if (isSafeToSpeculativelyExecute(UInstr))
+          UseTreeDuplicated.push_back({UInstr, UInstr->clone()});
+        else
+          UseTreeDuplicated.push_back({UInstr, nullptr});
+      }
+
+    SetVector<Instruction *> OperandSet;
+    for (auto Utd : UseTreeDuplicated)
+      OperandSet.insert(Utd.first);
+
+    for (unsigned i = 0; i < OperandSet.size(); ++i) {
+      auto Ops = OperandSet[i];
+      for (auto &Op : Ops->operands()) {
+        if (!dyn_cast<Instruction>(Op))
+          continue;
+        if (!dyn_cast(Op) && !dyn_cast(Op) &&
+            !dyn_cast(Op) && !dyn_cast(Op) &&
+            !dyn_cast(Op))
+          continue;
+        if (OperandSet.contains(dyn_cast<Instruction>(Op)))
+          continue;
+        OperandSet.insert(dyn_cast<Instruction>(Op));
+        if (isSafeToSpeculativelyExecute(dyn_cast<Instruction>(Op)))
+          UseTreeDuplicated.push_back(
+              {dyn_cast<Instruction>(Op), dyn_cast<Instruction>(Op)->clone()});
+        else
+          UseTreeDuplicated.push_back({dyn_cast<Instruction>(Op), nullptr});
+      }
+    }
+
+    SmallSet<unsigned, 16> IndexSet;
+    SmallSet<Instruction *, 16> InsertSet;
+    unsigned VecSize = UseTreeDuplicated.size();
+
+    while (IndexSet.size() < VecSize - 1)
+      for (unsigned i = 1; i < VecSize; ++i) {
+        if (IndexSet.contains(i))
+          continue;
+        bool InsertFlag = true;
+        for (auto &Op : UseTreeDuplicated[i].first->operands()) {
+          if (!dyn_cast<Instruction>(Op))
+            continue;
+          // If one of its operands is supposed to be inserted but is not
+          // placed yet, this instruction cannot be inserted either.
+          if ((cast<Instruction>(Op) != I) &&
+              OperandSet.contains(cast<Instruction>(Op)) &&
+              !InsertSet.contains(cast<Instruction>(Op))) {
+            InsertFlag = false;
+            break;
+          }
+        }
+        // Cannot be inserted yet.
+        if (!InsertFlag)
+          continue;
+        InsertSet.insert(UseTreeDuplicated[i].first);
+        IndexSet.insert(i);
+
+        // Insert the duplicate if it exists.
+        if (UseTreeDuplicated[i].second)
+          UseTreeDuplicated[i].second->insertBefore(UseTreeDuplicated[i].first);
+
+        // Move the instruction to the right BB.
+        UseTreeDuplicated[i].first->moveBefore(MoveI);
+      }
+
+    return true;
+  }
+
+  // Outlines the "issue" runtime call, together with the code that fills the
+  // offload arrays it consumes, into a separate wrapper function.
+  Function *outlineMapperIssueRT(CallInst &RuntimeCall) {
+    BasicBlock *RuntimeCallBB = RuntimeCall.getParent();
+    Value *BasePtrsArg =
+        RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
+    // %offload_ptrs.
+    Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
+    // %offload_sizes.
+    Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);
+    auto *BasePtrsArray =
+        dyn_cast<AllocaInst>(getUnderlyingObject(BasePtrsArg));
+    auto *PtrsArray = dyn_cast<AllocaInst>(getUnderlyingObject(PtrsArg));
+    auto *SizesArray = dyn_cast<AllocaInst>(getUnderlyingObject(SizesArg));
+    if (!BasePtrsArray || !PtrsArray)
+      return nullptr;
+
+    // Split off the basic block that will be outlined.
+    BasicBlock *MapperBB =
+        RuntimeCallBB->splitBasicBlock(&RuntimeCall, "mapper.bb");
+    BasicBlock *NextBB =
+        MapperBB->splitBasicBlock(RuntimeCall.getNextNode(), "next.bb");
+    SmallVector<std::pair<Instruction *, Instruction *>> BasePtrsUses;
+    SmallVector<std::pair<Instruction *, Instruction *>> PtrsUses;
+
+    getUseTreeDuplicated(BasePtrsUses, BasePtrsArray, (&RuntimeCall));
+    getUseTreeDuplicated(PtrsUses, PtrsArray, (&RuntimeCall));
+
+    for (auto Bpu : BasePtrsUses)
+      if (Bpu.second)
+        Bpu.first->replaceUsesOutsideBlock(Bpu.second, MapperBB);
+    for (auto Pu : PtrsUses)
+      if (Pu.second)
+        Pu.first->replaceUsesOutsideBlock(Pu.second, MapperBB);
+
+    // If the sizes array is also set up through an alloca, handle its use
+    // tree as well.
+    if (SizesArray) {
+      SmallVector<std::pair<Instruction *, Instruction *>> SizesArrayUses;
+      getUseTreeDuplicated(SizesArrayUses, SizesArray, (&RuntimeCall));
+      for (auto Sau : SizesArrayUses)
+        if (Sau.second)
+          Sau.first->replaceUsesOutsideBlock(Sau.second, MapperBB);
+    }
+    SmallVector<BasicBlock *> ExtractBB{MapperBB};
+    CodeExtractor CE(ExtractBB);
+    CodeExtractorAnalysisCache CEAC(*(MapperBB->getParent()));
+    Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
+    if (!OutlinedFunc)
+      return nullptr;
+
+    OutlinedFunc->setName("__openmp_mapper_issue_wrapper_" +
+                          MapperBB->getParent()->getName());
+
+    MergeBlockIntoPredecessor(NextBB);
+    // Annotate the outlined function.
+    OutlinedFunc->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
+    unsigned i = 0;
+    for (auto &A : OutlinedFunc->args()) {
+      if (A.getType()->isPointerTy())
+        OutlinedFunc->addParamAttr(i, Attribute::NoCapture);
+      i++;
+    }
+    return OutlinedFunc;
+  }
+
+  // Returns true if \p CI can be moved before or after \p I.
+  bool canMoveThrough(CallInst *CI, Instruction *I, AliasAnalysis &AA,
+                      SmallSet<Value *, 16> &CIArgs) {
+    if (CIArgs.contains(I))
+      return false;
+    if (!(I->mayHaveSideEffects()))
+      return true;
+    auto MR = AA.getModRefInfo(I, CI);
+    return isNoModRef(MR);
+  }
+
+  // Returns true if \p CI can be moved across every instruction in \p B.
+  bool canMoveThroughBlock(CallInst *CI, BasicBlock *B, AliasAnalysis &AA,
+                           SmallSet<Value *, 16> &CIArgs) {
+    for (auto &I : *B)
+      if (!canMoveThrough(CI, &I, AA, CIArgs))
+        return false;
+    return true;
+  }
+
+  // This function gets the current BB of the 'issue' call and returns the
+  // next BB the 'issue' call can safely move to.
+  BasicBlock *findNextBBToCheckForMoving(CallInst *CI, BasicBlock *B,
+                                         AliasAnalysis &AA, DominatorTree &DT,
+                                         PostDominatorTree &PDT, LoopInfo &LI,
+                                         SmallSet<Value *, 16> &CIArgs) {
+    Loop *BLoop = LI.getLoopFor(B);
+    BasicBlock *DestBlock = B;
+    BasicBlock *DomBlock;
+
+    // Walk up the dominator tree until we find an immediate dominator that is
+    // in the same loop as B.
+    while (true) {
+      auto Dom = DT.getNode(DestBlock)->getIDom();
+      if (!Dom)
+        return B;
+      DomBlock = Dom->getBlock();
+      Loop *DomLoop = LI.getLoopFor(DomBlock);
+      if (BLoop == DomLoop)
+        break;
+      DestBlock = DomBlock;
+    }
+    // Every block on a path from the candidate dominator to B must be safe to
+    // move the call across.
+    SetVector<BasicBlock *> PredecessorBB;
+    PredecessorBB.insert(B);
+
+    for (unsigned i = 0; i < PredecessorBB.size(); ++i)
+      for (BasicBlock *S : predecessors(PredecessorBB[i])) {
+        if (S == DomBlock)
+          continue;
+        if (!canMoveThroughBlock(CI, S, AA, CIArgs))
+          return B;
+        PredecessorBB.insert(S);
+      }
+
+    bool PostDomination = PDT.dominates(B, DomBlock);
+
+    if (PostDomination)
+      return DomBlock;
+    return B;
+  }
+
+  // Moves the "wait" runtime call downwards within its original BB, as far as
+  // it is safe to do so.
+  bool moveWaitRTCInOrigBB(CallInst *IssueWrapperCall, CallInst *RTCallWait,
+                           AliasAnalysis &AA, SmallSet<Value *, 16> &CIArgs) {
+    Instruction *I = RTCallWait;
+    Instruction *WaitMovePoint = nullptr;
+
+    while ((I = I->getNextNonDebugInstruction()))
+      if (!canMoveThrough(IssueWrapperCall, I, AA, CIArgs)) {
+        WaitMovePoint = I;
+        break;
+      }
+    if (!I)
+      WaitMovePoint = RTCallWait->getParent()->getTerminator();
+    RTCallWait->moveBefore(WaitMovePoint);
+    return true;
+  }
+
+  // Moves the "issue" wrapper call upwards within \p CurrentBB. Returns false
+  // if the call could be moved beyond the beginning of its original BB.
+  bool moveIssueRTCInBB(CallInst *IssueWrapperCall, BasicBlock *CurrentBB,
+                        AliasAnalysis &AA, SmallSet<Value *, 16> &IssueCallArgs,
+                        bool IsOrigBB = false) {
+    Instruction *IssueMovePoint = nullptr;
+    Instruction *I;
+    if (CurrentBB == IssueWrapperCall->getParent())
+      I = IssueWrapperCall;
+    else
+      I = CurrentBB->getTerminator();
+
+    while ((I = I->getPrevNonDebugInstruction()))
+      if (!canMoveThrough(IssueWrapperCall, I, AA, IssueCallArgs)) {
+        IssueMovePoint = I;
+        break;
+      }
+
+    // The call can be moved beyond its original BB. For the time being, move
+    // it to the beginning of the BB and let the caller keep looking.
+    if (!I && IsOrigBB) {
+      IssueWrapperCall->moveBefore(CurrentBB->getFirstNonPHI());
+      return false;
+    }
+    // Insert the issue call at the very beginning of the BB.
+    if (!I) {
+      IssueWrapperCall->moveBefore(CurrentBB->getFirstNonPHI());
+      return true;
+    }
+    IssueWrapperCall->moveAfter(IssueMovePoint);
+    return true;
+  }
+
+  // Moves the "issue" wrapper call upwards and the "wait" call downwards as
+  // far as it is safe to do so.
+  bool moveIssueAndWaitRTC(CallInst *IssueWrapperCall, CallInst *RTCallWait,
+                           AliasAnalysis &AA, LoopInfo &LI, DominatorTree &DT,
+                           PostDominatorTree &PDT, BasicBlock *IssueBB) {
+
+    SmallSet<Value *, 16> IssueCallArgs;
+    for (auto &A : IssueWrapperCall->args())
+      IssueCallArgs.insert(A);
+
+    moveWaitRTCInOrigBB(IssueWrapperCall, RTCallWait, AA, IssueCallArgs);
+    bool IssueMovedInOrigBB =
+        moveIssueRTCInBB(IssueWrapperCall, IssueBB, AA, IssueCallArgs, true);
+    if (IssueMovedInOrigBB)
+      return true;
+
+    BasicBlock *CurrentBB = IssueBB;
+    BasicBlock *NextBB;
+    while ((NextBB = findNextBBToCheckForMoving(IssueWrapperCall, CurrentBB, AA,
+                                                DT, PDT, LI, IssueCallArgs))) {
+      // It cannot move any further.
+      if (NextBB == CurrentBB)
+        break;
+      CurrentBB = NextBB;
+    }
+    if (CurrentBB == IssueBB)
+      moveIssueRTCInBB(IssueWrapperCall, CurrentBB, AA, IssueCallArgs, true);
+    else
+      moveIssueRTCInBB(IssueWrapperCall, CurrentBB, AA, IssueCallArgs);
+    return true;
+  }
+
   /// Tries to hide the latency of runtime calls that involve host to
   /// device memory transfers by splitting them into their "issue" and "wait"
   /// versions. The "issue" is moved upwards as much as possible. The "wait" is
-  /// moved downards as much as possible. The "issue" issues the memory transfer
-  /// asynchronously, returning a handle. The "wait" waits in the returned
-  /// handle for the memory transfer to finish.
-  bool hideMemTransfersLatency() {
-    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
+  /// moved downwards as much as possible in its original BB. The "issue"
+  /// issues the memory transfer asynchronously, returning a handle. The
+  /// "wait" waits in the returned handle for the memory transfer to finish.
+  bool hideMemTransfersLatencyHelper(Function &F) {
+
+    OMPInformationCache::RuntimeFunctionInfo &RFI =
+        OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
     bool Changed = false;
-    auto SplitMemTransfers = [&](Use &U, Function &Decl) {
+    auto AsyncMemTransfers = [&](Use &U, Function &Decl) {
       auto *RTCall = getCallIfRegularCall(U, &RFI);
       if (!RTCall)
         return false;
+      Changed = true;
 
-      OffloadArray OffloadArrays[3];
-      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
-        return false;
-
-      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
-
-      // TODO: Check if can be moved upwards.
-      bool WasSplit = false;
-      Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
-      if (WaitMovementPoint)
-        WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
+      std::pair<CallInst *, CallInst *> IssueWaitPair;
+      splitMapperToIssueAndWait(RTCall, IssueWaitPair);
+      CallInst *RTCallIssue = IssueWaitPair.first;
+      CallInst *RTCallWait = IssueWaitPair.second;
+      BasicBlock *IssueBB = RTCallIssue->getParent();
+      Function *IssueWrapper = outlineMapperIssueRT(*RTCallIssue);
+      assert(IssueWrapper && "Incomplete outlining in latency hiding");
+      // There is a one-to-one relation between the runtime call and
+      // IssueWrapper.
+      CallInst *IssueWrapperCall =
+          cast<CallInst>(*(IssueWrapper->users().begin()));
+      assert(IssueWrapperCall && "Incomplete outlining in latency hiding");
+
+      AliasAnalysis *AA =
+          OMPInfoCache.getAnalysisResultForFunction<AAManager>(F);
+      LoopInfo *LI = OMPInfoCache.getAnalysisResultForFunction<LoopAnalysis>(F);
+      DominatorTree *DT =
+          OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F);
+      PostDominatorTree *PDT =
+          OMPInfoCache.getAnalysisResultForFunction<PostDominatorTreeAnalysis>(
+              F);
+
+      moveIssueAndWaitRTC(IssueWrapperCall, RTCallWait, *AA, *LI, *DT, *PDT,
+                          IssueBB);
 
-      Changed |= WasSplit;
-      return WasSplit;
+      return true;
     };
 
-    RFI.foreachUse(SCC, SplitMemTransfers);
+    RFI.foreachUse(AsyncMemTransfers, &F);
 
     return Changed;
   }
 
+  bool hideMemTransfersLatency() {
+    bool Changed = false;
+    for (auto &F : M)
+      Changed |= hideMemTransfersLatencyHelper(F);
+    return Changed;
+  }
+
   /// Eliminates redundant, aligned barriers in OpenMP offloaded kernels.
   /// TODO: Make this an AA and expand it to work across blocks and functions.
   bool eliminateBarriers() {
@@ -1719,81 +2062,6 @@
     LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
   }
 
-  /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
-  /// moved. Returns nullptr if the movement is not possible, or not worth it.
-  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
-    // FIXME: This traverses only the BasicBlock where RuntimeCall is.
-    //        Make it traverse the CFG.
-
-    Instruction *CurrentI = &RuntimeCall;
-    bool IsWorthIt = false;
-    while ((CurrentI = CurrentI->getNextNode())) {
-
-      // TODO: Once we detect the regions to be offloaded we should use the
-      //       alias analysis manager to check if CurrentI may modify one of
-      //       the offloaded regions.
-      if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
-        if (IsWorthIt)
-          return CurrentI;
-
-        return nullptr;
-      }
-
-      // FIXME: For now if we move it over anything without side effect
-      //        is worth it.
-      IsWorthIt = true;
-    }
-
-    // Return end of BasicBlock.
- return RuntimeCall.getParent()->getTerminator(); - } - - /// Splits \p RuntimeCall into its "issue" and "wait" counterparts. - bool splitTargetDataBeginRTC(CallInst &RuntimeCall, - Instruction &WaitMovementPoint) { - // Create stack allocated handle (__tgt_async_info) at the beginning of the - // function. Used for storing information of the async transfer, allowing to - // wait on it later. - auto &IRBuilder = OMPInfoCache.OMPBuilder; - auto *F = RuntimeCall.getCaller(); - Instruction *FirstInst = &(F->getEntryBlock().front()); - AllocaInst *Handle = new AllocaInst( - IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst); - - // Add "issue" runtime call declaration: - // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32, - // i8**, i8**, i64*, i64*) - FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___tgt_target_data_begin_mapper_issue); - - // Change RuntimeCall call site for its asynchronous version. - SmallVector Args; - for (auto &Arg : RuntimeCall.args()) - Args.push_back(Arg.get()); - Args.push_back(Handle); - - CallInst *IssueCallsite = - CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall); - OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite); - RuntimeCall.eraseFromParent(); - - // Add "wait" runtime call declaration: - // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info) - FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction( - M, OMPRTL___tgt_target_data_begin_mapper_wait); - - Value *WaitParams[2] = { - IssueCallsite->getArgOperand( - OffloadArray::DeviceIDArgNum), // device_id. - Handle // handle to wait on. - }; - CallInst *WaitCallsite = CallInst::Create( - WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); - OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite); - - return true; - } - static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, bool GlobalOnly, bool &SingleChoice) { if (CurrentIdent == NextIdent) Index: llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll =================================================================== --- llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll +++ llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll @@ -1,528 +1,306 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature -; RUN: opt -S -passes=openmp-opt-cgscc -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency < %s | FileCheck %s -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" - -; CHECK: %struct.__tgt_async_info = type { i8* } - -%struct.ident_t = type { i32, i32, i32, i32, i8* } -%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } - -@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] -@.__omp_offloading_heavyComputation1.region_id = weak constant i8 0 -@.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 8] -@.offload_maptypes.2 = private unnamed_addr constant [1 x i64] [i64 800] - -@.__omp_offloading_heavyComputation2.region_id = weak constant i8 0 -@.offload_maptypes.3 = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=openmp-opt -openmp-hide-memory-transfer-latency < %s | FileCheck %s -@.__omp_offloading_heavyComputation3.region_id = weak constant i8 0 -@.offload_sizes.2 = private unnamed_addr constant [2 x i64] [i64 4, i64 0] -@.offload_maptypes.4 = private 
unnamed_addr constant [2 x i64] [i64 800, i64 544] - -@.offload_maptypes.5 = private unnamed_addr constant [1 x i64] [i64 33] - -@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8 -@.str0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 - -;double heavyComputation1() { -; double a = rand() % 777; -; double random = rand(); -; -; //#pragma omp target data map(a) -; void* args[1]; -; args[0] = &a; -; __tgt_target_data_begin(..., args, ...) -; -; #pragma omp target teams -; for (int i = 0; i < 1000; ++i) { -; a *= i*i / 2; -; } -; -; return random + a; -;} -define dso_local double @heavyComputation1() { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation1() { +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, ptr } + +@global = dso_local global i32 0, align 4 +@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 3] +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8 +@.str = private unnamed_addr constant [15 x i8] c"*** Result %d\0A\00", align 1 +@.str.1 = private unnamed_addr constant [7 x i8] c"Begin\0A\00", align 1 +@llvm.embedded.object = private constant [976 x i8] c"\10\FF\10\AD\01\00\00\00\D0\03\00\00\00\00\00\00 \00\00\00\00\00\00\00(\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00H\00\00\00\00\00\00\00\02\00\00\00\00\00\00\00\90\00\00\00\00\00\00\00:\03\00\00\00\00\00\00n\00\00\00\00\00\00\00u\00\00\00\00\00\00\00i\00\00\00\00\00\00\00\89\00\00\00\00\00\00\00\00arch\00triple\00nvptx64-nvidia-cuda\00sm_61\00\00; ModuleID = 'offload-openmp-nvptx64-nvidia-cuda.tmp.bc'\0Asource_filename = \22offload.c\22\0Atarget datalayout = \22e-i64:64-i128:128-v16:16-v32:32-n16:32:64\22\0Atarget triple = \22nvptx64-nvidia-cuda\22\0A\0A@__omp_rtl_debug_kind = weak_odr hidden constant i32 0\0A@__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0\0A@__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0\0A@__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0\0A\0A!llvm.module.flags = !{!0, !1, !2, !3, !4}\0A!llvm.ident = !{!5}\0A\0A!0 = !{i32 1, !\22wchar_size\22, i32 4}\0A!1 = !{i32 7, !\22openmp\22, i32 50}\0A!2 = !{i32 7, !\22openmp-device\22, i32 50}\0A!3 = !{i32 7, !\22PIC Level\22, i32 2}\0A!4 = !{i32 7, !\22frame-pointer\22, i32 2}\0A!5 = !{!\22clang version 16.0.0 (https://github.com/llvm/llvm-project.git cf1d9a1fdca258ab56f3060dfa4a303b8127350e)\22}\0A\00\00\00\00\00\00", section ".llvm.offloading", align 8, !exclude !0 +@llvm.compiler.used = appending global [1 x ptr] [ptr @llvm.embedded.object], section "llvm.metadata" + +; Function Attrs: noinline nounwind uwtable +define dso_local void @test_func(i32 noundef %N) #0 { +; CHECK-LABEL: @test_func( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 -; CHECK-NEXT: [[A:%.*]] = alloca double, align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS4:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS5:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to i8* -; 
CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 777 -; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to double -; CHECK-NEXT: store double [[CONV]], double* [[A]], align 8 -; CHECK-NEXT: [[CALL1:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @[[GLOB0:[0-9]+]], i64 -1, i32 1, i8** [[TMP1]], i8** [[TMP3]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to i64* -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS4]], i64 0, i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS4]] to i64* -; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP8]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS5]], i64 0, i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS5]] to i64* -; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP10]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull [[TMP7]], i8** nonnull [[TMP9]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP11]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation1FallBack(i64 [[TMP6]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[CONV2:%.*]] = sitofp i32 [[CALL1]] to double -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[A]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP12]], [[CONV2]] -; CHECK-NEXT: ret double [[ADD]] -; - - - - - - -entry: - %a = alloca double, align 8 - %.offload_baseptrs = alloca [1 x i8*], align 8 - %.offload_ptrs = alloca [1 x i8*], align 8 - %.offload_baseptrs4 = alloca [1 x i8*], align 8 - %.offload_ptrs5 = alloca [1 x i8*], align 8 - - %0 = bitcast double* %a to i8* - %call = tail call i32 (...) 
@rand() - %rem = srem i32 %call, 777 - %conv = sitofp i32 %rem to double - store double %conv, double* %a, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @0, ...) should be moved here. - %call1 = tail call i32 (...) @rand() - - %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [1 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [1 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) - - %5 = bitcast double* %a to i64* - %6 = load i64, i64* %5, align 8 - %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0 - %8 = bitcast [1 x i8*]* %.offload_baseptrs4 to i64* - store i64 %6, i64* %8, align 8 - %9 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs5, i64 0, i64 0 - %10 = bitcast [1 x i8*]* %.offload_ptrs5 to i64* - store i64 %6, i64* %10, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. - %11 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation1.region_id, i32 1, i8** nonnull %7, i8** nonnull %9, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.2, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %11, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation1FallBack(i64 %6) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %conv2 = sitofp i32 %call1 to double - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null, i8** null) - %12 = load double, double* %a, align 8 - %add = fadd double %12, %conv2 - ret double %add -} - -define internal void @heavyComputation1FallBack(i64 %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation1FallBack -; CHECK-SAME: (i64 [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret void -; -entry: - ; Fallback for offloading function heavyComputation1. - ret void -} - -;int heavyComputation2(double* a, unsigned size) { -; int random = rand() % 7; -; -; //#pragma omp target data map(a[0:size], size) -; void* args[2]; -; args[0] = &a; -; args[1] = &size; -; __tgt_target_data_begin(..., args, ...) 
-; -; #pragma omp target teams -; for (int i = 0; i < size; ++i) { -; a[i] = ++a[i] * 3.141624; -; } -; -; return random; -;} -define dso_local i32 @heavyComputation2(double* %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation2 -; CHECK-SAME: (double* [[A:%.*]], i32 [[SIZE:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SIZE_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: store i32 [[SIZE]], i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @rand() -; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0 -; CHECK-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8** [[TMP8]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 1 -; CHECK-NEXT: store i64 4, i64* [[TMP10]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[SIZE_CASTED:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP13]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS3]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 1 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP17]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 
0, i64 1 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull [[TMP12]], i8** nonnull [[TMP14]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP20]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation2FallBack(i64 [[SIZE_CASTED]], double* [[A]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 7 -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: ret i32 [[REM]] -; - - -entry: - %size.addr = alloca i32, align 4 - %.offload_baseptrs = alloca [2 x i8*], align 8 - %.offload_ptrs = alloca [2 x i8*], align 8 - %.offload_sizes = alloca [2 x i64], align 8 - %.offload_baseptrs2 = alloca [2 x i8*], align 8 - %.offload_ptrs3 = alloca [2 x i8*], align 8 - - store i32 %size, i32* %size.addr, align 4 - %call = tail call i32 (...) @rand() - - %conv = zext i32 %size to i64 - %0 = shl nuw nsw i64 %conv, 3 - %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [2 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [2 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0 - store i64 %0, i64* %5, align 8 - %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1 - %7 = bitcast i8** %6 to i32** - store i32* %size.addr, i32** %7, align 8 - %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1 - %9 = bitcast i8** %8 to i32** - store i32* %size.addr, i32** %9, align 8 - %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 - store i64 4, i64* %10, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - - %11 = load i32, i32* %size.addr, align 4 - %size.casted = zext i32 %11 to i64 - %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 - %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64* - store i64 %size.casted, i64* %13, align 8 - %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0 - %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64* - store i64 %size.casted, i64* %15, align 8 - %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1 - %17 = bitcast i8** %16 to double** - store double* %a, double** %17, align 8 - %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1 - %19 
= bitcast i8** %18 to double** - store double* %a, double** %19, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. - %20 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation2.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %20, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation2FallBack(i64 %size.casted, double* %a) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %rem = srem i32 %call, 7 - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - ret i32 %rem -} - -define internal void @heavyComputation2FallBack(i64 %size, double* %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation2FallBack -; CHECK-SAME: (i64 [[SIZE:%.*]], double* [[A:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [1 x i64], align 8 +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[MUL:%.*]] = mul i64 4, [[CONV]] +; CHECK-NEXT: [[CALL:%.*]] = call noalias ptr @malloc(i64 noundef [[MUL]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I_0]], 8 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I_0]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[IDXPROM]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[N]] to i64 +; CHECK-NEXT: [[MUL3:%.*]] = mul i64 4, [[CONV2]] +; CHECK-NEXT: [[CALL4:%.*]] = call noalias ptr @malloc(i64 noundef [[MUL3]]) #[[ATTR5]] +; CHECK-NEXT: call void @__openmp_mapper_issue_wrapper_test_func.mapper.bb(ptr [[DOTOFFLOAD_BASEPTRS]], ptr [[CALL]], ptr [[DOTOFFLOAD_PTRS]], ptr [[DOTOFFLOAD_SIZES]], i32 [[N]], ptr [[HANDLE]]) +; CHECK-NEXT: br label [[FOR_COND6:%.*]] +; CHECK: for.cond6: +; CHECK-NEXT: [[I5_0:%.*]] = phi i32 [ 0, [[FOR_END]] ], [ [[INC13:%.*]], [[FOR_INC12:%.*]] ] +; CHECK-NEXT: [[CMP7:%.*]] = icmp slt i32 [[I5_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY9:%.*]], label [[FOR_END14:%.*]] +; CHECK: for.body9: +; CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[I5_0]] to i64 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[CALL4]], i64 [[IDXPROM10]] +; CHECK-NEXT: store i32 [[I5_0]], ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: br label [[FOR_INC12]] +; CHECK: for.inc12: +; 
CHECK-NEXT: [[INC13]] = add nsw i32 [[I5_0]], 1 +; CHECK-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: for.end14: +; CHECK-NEXT: br label [[FOR_COND16:%.*]] +; CHECK: for.cond16: +; CHECK-NEXT: [[I15_0:%.*]] = phi i32 [ 0, [[FOR_END14]] ], [ [[INC38:%.*]], [[FOR_INC37:%.*]] ] +; CHECK-NEXT: [[CMP17:%.*]] = icmp slt i32 [[I15_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP17]], label [[FOR_BODY19:%.*]], label [[FOR_END39:%.*]] +; CHECK: for.body19: +; CHECK-NEXT: br label [[FOR_COND20:%.*]] +; CHECK: for.cond20: +; CHECK-NEXT: [[J_0:%.*]] = phi i32 [ 0, [[FOR_BODY19]] ], [ [[INC35:%.*]], [[FOR_INC34:%.*]] ] +; CHECK-NEXT: [[CMP21:%.*]] = icmp slt i32 [[J_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_BODY23:%.*]], label [[FOR_END36:%.*]] +; CHECK: for.body23: +; CHECK-NEXT: br label [[FOR_COND24:%.*]] +; CHECK: for.cond24: +; CHECK-NEXT: [[K_0:%.*]] = phi i32 [ 0, [[FOR_BODY23]] ], [ [[INC32:%.*]], [[FOR_INC31:%.*]] ] +; CHECK-NEXT: [[CMP25:%.*]] = icmp slt i32 [[K_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP25]], label [[FOR_BODY27:%.*]], label [[FOR_END33:%.*]] +; CHECK: for.body27: +; CHECK-NEXT: [[ADD28:%.*]] = add nsw i32 [[I15_0]], [[J_0]] +; CHECK-NEXT: [[ADD29:%.*]] = add nsw i32 [[ADD28]], [[K_0]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @global, align 4 +; CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP0]], [[ADD29]] +; CHECK-NEXT: store i32 [[ADD30]], ptr @global, align 4 +; CHECK-NEXT: br label [[FOR_INC31]] +; CHECK: for.inc31: +; CHECK-NEXT: [[INC32]] = add nsw i32 [[K_0]], 1 +; CHECK-NEXT: br label [[FOR_COND24]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end33: +; CHECK-NEXT: br label [[FOR_INC34]] +; CHECK: for.inc34: +; CHECK-NEXT: [[INC35]] = add nsw i32 [[J_0]], 1 +; CHECK-NEXT: br label [[FOR_COND20]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: for.end36: +; CHECK-NEXT: br label [[FOR_INC37]] +; CHECK: for.inc37: +; CHECK-NEXT: [[INC38]] = add nsw i32 [[I15_0]], 1 +; CHECK-NEXT: br label [[FOR_COND16]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: for.end39: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @global, align 4 +; CHECK-NEXT: [[INC40:%.*]] = add nsw i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 [[INC40]], ptr @global, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +; CHECK-NEXT: store ptr null, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +; CHECK-NEXT: br label [[CODEREPL:%.*]] +; CHECK: codeRepl: +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[N]], 1 +; CHECK-NEXT: [[IDXPROM43:%.*]] = sext i32 [[SUB]] to i64 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, ptr [[CALL]], i64 [[IDXPROM43]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX44]], align 4 +; CHECK-NEXT: call void 
@__tgt_target_data_begin_mapper_wait(ptr @[[GLOB1:[0-9]+]], i64 -1, ptr [[HANDLE]]) +; CHECK-NEXT: [[CALL45:%.*]] = call i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0 +; CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr @[[GLOB1]], i64 -1, i32 1, ptr [[TMP13]], ptr [[TMP14]], ptr [[TMP15]], ptr @.offload_maptypes, ptr null, ptr null) ; CHECK-NEXT: ret void ; entry: - ; Fallback for offloading function heavyComputation2. + %.offload_baseptrs = alloca [1 x ptr], align 8 + %.offload_ptrs = alloca [1 x ptr], align 8 + %.offload_mappers = alloca [1 x ptr], align 8 + %.offload_sizes = alloca [1 x i64], align 8 + %conv = sext i32 %N to i64 + %mul = mul i64 4, %conv + %call = call noalias ptr @malloc(i64 noundef %mul) #4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %N + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %i.0, 8 + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds i32, ptr %call, i64 %idxprom + store i32 %add, ptr %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond, !llvm.loop !9 + +for.end: ; preds = %for.cond + %conv2 = sext i32 %N to i64 + %mul3 = mul i64 4, %conv2 + %call4 = call noalias ptr @malloc(i64 noundef %mul3) #4 + br label %for.cond6 + +for.cond6: ; preds = %for.inc12, %for.end + %i5.0 = phi i32 [ 0, %for.end ], [ %inc13, %for.inc12 ] + %cmp7 = icmp slt i32 %i5.0, %N + br i1 %cmp7, label %for.body9, label %for.end14 + +for.body9: ; preds = %for.cond6 + %idxprom10 = sext i32 %i5.0 to i64 + %arrayidx11 = getelementptr inbounds i32, ptr %call4, i64 %idxprom10 + store i32 %i5.0, ptr %arrayidx11, align 4 + br label %for.inc12 + +for.inc12: ; preds = %for.body9 + %inc13 = add nsw i32 %i5.0, 1 + br label %for.cond6, !llvm.loop !11 + +for.end14: ; preds = %for.cond6 + br label %for.cond16 + +for.cond16: ; preds = %for.inc37, %for.end14 + %i15.0 = phi i32 [ 0, %for.end14 ], [ %inc38, %for.inc37 ] + %cmp17 = icmp slt i32 %i15.0, %N + br i1 %cmp17, label %for.body19, label %for.end39 + +for.body19: ; preds = %for.cond16 + br label %for.cond20 + +for.cond20: ; preds = %for.inc34, %for.body19 + %j.0 = phi i32 [ 0, %for.body19 ], [ %inc35, %for.inc34 ] + %cmp21 = icmp slt i32 %j.0, %N + br i1 %cmp21, label %for.body23, label %for.end36 + +for.body23: ; preds = %for.cond20 + br label %for.cond24 + +for.cond24: ; preds = %for.inc31, %for.body23 + %k.0 = phi i32 [ 0, %for.body23 ], [ %inc32, %for.inc31 ] + %cmp25 = icmp slt i32 %k.0, %N + br i1 %cmp25, label %for.body27, label %for.end33 + +for.body27: ; preds = %for.cond24 + %add28 = add nsw i32 %i15.0, %j.0 + %add29 = add nsw i32 %add28, %k.0 + %0 = load i32, ptr @global, align 4 + %add30 = add nsw i32 %0, %add29 + store i32 %add30, ptr @global, align 4 + br label %for.inc31 + +for.inc31: ; preds = %for.body27 + %inc32 = add nsw i32 %k.0, 1 + br label %for.cond24, !llvm.loop !12 + +for.end33: ; preds = %for.cond24 + br label %for.inc34 + +for.inc34: ; preds = %for.end33 + %inc35 = add nsw i32 %j.0, 1 + br label %for.cond20, !llvm.loop !13 + +for.end36: ; preds = %for.cond20 + 
br label %for.inc37 + +for.inc37: ; preds = %for.end36 + %inc38 = add nsw i32 %i15.0, 1 + br label %for.cond16, !llvm.loop !14 + +for.end39: ; preds = %for.cond16 + %1 = load i32, ptr @global, align 4 + %inc40 = add nsw i32 %1, 1 + store i32 %inc40, ptr @global, align 4 + %arrayidx41 = getelementptr inbounds i32, ptr %call, i64 0 + %conv42 = sext i32 %N to i64 + %2 = mul nuw i64 %conv42, 4 + %3 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 + store ptr %call, ptr %3, align 8 + %4 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 + store ptr %arrayidx41, ptr %4, align 8 + %5 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 + store i64 %2, ptr %5, align 8 + %6 = getelementptr inbounds [1 x ptr], ptr %.offload_mappers, i64 0, i64 0 + store ptr null, ptr %6, align 8 + %7 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 + %8 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 + %9 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 + call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 1, ptr %7, ptr %8, ptr %9, ptr @.offload_maptypes, ptr null, ptr null) + %sub = sub nsw i32 %N, 1 + %idxprom43 = sext i32 %sub to i64 + %arrayidx44 = getelementptr inbounds i32, ptr %call, i64 %idxprom43 + %10 = load i32, ptr %arrayidx44, align 4 + %call45 = call i32 (ptr, ...) @printf(ptr noundef @.str, i32 noundef %10) + %11 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0 + %12 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0 + %13 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0 + call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 1, ptr %11, ptr %12, ptr %13, ptr @.offload_maptypes, ptr null, ptr null) ret void } -;int heavyComputation3(double* restrict a, unsigned size) { -; int random = rand() % 7; -; -; //#pragma omp target data map(a[0:size], size) -; void* args[2]; -; args[0] = &a; -; args[1] = &size; -; __tgt_target_data_begin(..., args, ...) -; -; #pragma omp target teams -; for (int i = 0; i < size; ++i) { -; a[i] = ++a[i] * 3.141624; -; } -; -; return random; -;} -define dso_local i32 @heavyComputation3(double* noalias %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation3 -; CHECK-SAME: (double* noalias [[A:%.*]], i32 [[SIZE:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SIZE_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [2 x i64], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [2 x i8*], align 8 -; CHECK-NEXT: store i32 [[SIZE]], i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@rand() -; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0 -; CHECK-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8** [[TMP8]] to i32** -; CHECK-NEXT: store i32* [[SIZE_ADDR]], i32** [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 1 -; CHECK-NEXT: store i64 4, i64* [[TMP10]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[SIZE_ADDR]], align 4 -; CHECK-NEXT: [[SIZE_CASTED:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP13]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast [2 x i8*]* [[DOTOFFLOAD_PTRS3]] to i64* -; CHECK-NEXT: store i64 [[SIZE_CASTED]], i64* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS2]], i64 0, i64 1 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8** [[TMP16]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP17]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS3]], i64 0, i64 1 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8** [[TMP18]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull [[TMP12]], i8** nonnull [[TMP14]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP20]], 0 -; CHECK-NEXT: br i1 [[DOTNOT]], label [[OMP_OFFLOAD_CONT:%.*]], label [[OMP_OFFLOAD_FAILED:%.*]] -; CHECK: omp_offload.failed: -; CHECK-NEXT: call void @heavyComputation3FallBack(i64 [[SIZE_CASTED]], double* 
[[A]]) -; CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] -; CHECK: omp_offload.cont: -; CHECK-NEXT: [[REM:%.*]] = srem i32 [[CALL]], 7 -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 2, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: ret i32 [[REM]] -; +; Function Attrs: nounwind allocsize(0) +declare noalias ptr @malloc(i64 noundef) #1 +; Function Attrs: nounwind +declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) #2 -entry: - %size.addr = alloca i32, align 4 - %.offload_baseptrs = alloca [2 x i8*], align 8 - %.offload_ptrs = alloca [2 x i8*], align 8 - %.offload_sizes = alloca [2 x i64], align 8 - %.offload_baseptrs2 = alloca [2 x i8*], align 8 - %.offload_ptrs3 = alloca [2 x i8*], align 8 - store i32 %size, i32* %size.addr, align 4 - - ; FIXME: call to @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @0, ...) should be moved here. - %call = tail call i32 (...) @rand() - - %conv = zext i32 %size to i64 - %0 = shl nuw nsw i64 %conv, 3 - %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [2 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [2 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0 - store i64 %0, i64* %5, align 8 - %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1 - %7 = bitcast i8** %6 to i32** - store i32* %size.addr, i32** %7, align 8 - %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1 - %9 = bitcast i8** %8 to i32** - store i32* %size.addr, i32** %9, align 8 - %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 - store i64 4, i64* %10, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - - %11 = load i32, i32* %size.addr, align 4 - %size.casted = zext i32 %11 to i64 - %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 - %13 = bitcast [2 x i8*]* %.offload_baseptrs2 to i64* - store i64 %size.casted, i64* %13, align 8 - %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 0 - %15 = bitcast [2 x i8*]* %.offload_ptrs3 to i64* - store i64 %size.casted, i64* %15, align 8 - %16 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 1 - %17 = bitcast i8** %16 to double** - store double* %a, double** %17, align 8 - %18 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs3, i64 0, i64 1 - %19 = bitcast i8** %18 to double** - store double* %a, double** %19, align 8 - - ; FIXME: call to @__tgt_target_data_begin_mapper_wait(...) should be moved here. 
- %20 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @0, i64 -1, i8* nonnull @.__omp_offloading_heavyComputation3.region_id, i32 2, i8** nonnull %12, i8** nonnull %14, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes.2, i64 0, i64 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.4, i64 0, i64 0), i8** null, i8** null, i32 0, i32 0) - %.not = icmp eq i32 %20, 0 - br i1 %.not, label %omp_offload.cont, label %omp_offload.failed - -omp_offload.failed: ; preds = %entry - call void @heavyComputation3FallBack(i64 %size.casted, double* %a) - br label %omp_offload.cont - -omp_offload.cont: ; preds = %omp_offload.failed, %entry - %rem = srem i32 %call, 7 - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null, i8** null) - ret i32 %rem -} +declare i32 @printf(ptr noundef, ...) #3 -define internal void @heavyComputation3FallBack(i64 %size, double* %a) { -; CHECK-LABEL: define {{[^@]+}}@heavyComputation3FallBack -; CHECK-SAME: (i64 [[SIZE:%.*]], double* [[A:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret void -; -entry: - ; Fallback for offloading function heavyComputation3. - ret void -} +; Function Attrs: nounwind +declare void @__tgt_target_data_end_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) #2 -;int dataTransferOnly1(double* restrict a, unsigned size) { -; // Random computation. -; int random = rand(); -; -; //#pragma omp target data map(to:a[0:size]) -; void* args[1]; -; args[0] = &a; -; __tgt_target_data_begin(..., args, ...) -; -; // Random computation. -; random %= size; -; return random; -;} -define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) { -; CHECK-LABEL: define {{[^@]+}}@dataTransferOnly1 -; CHECK-SAME: (double* noalias [[A:%.*]], i32 [[SIZE:%.*]]) { +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @main(i32 noundef %argc, ptr noundef %argv) #0 { +; CHECK-LABEL: @main( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[HANDLE:%.*]] = alloca [[STRUCT___TGT_ASYNC_INFO:%.*]], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [1 x i64], align 8 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@rand() -; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[SIZE]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[CONV]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_BASEPTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOFFLOAD_PTRS]], i64 0, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[DOTOFFLOAD_PTRS]] to double** -; CHECK-NEXT: store double* [[A]], double** [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x i64], [1 x i64]* [[DOTOFFLOAD_SIZES]], i64 0, i64 0 -; CHECK-NEXT: store i64 [[TMP0]], i64* [[TMP5]], align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_issue(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** [[TMP1]], i8** [[TMP3]], i64* [[TMP5]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: [[REM:%.*]] = urem i32 [[CALL]], [[SIZE]] -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info* [[HANDLE]]) -; CHECK-NEXT: call void @__tgt_target_data_end_mapper(%struct.ident_t* @[[GLOB0]], i64 -1, i32 1, i8** nonnull [[TMP1]], i8** nonnull [[TMP3]], i64* nonnull [[TMP5]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null) -; CHECK-NEXT: ret i32 [[REM]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr, ...) @printf(ptr noundef @.str.1) +; CHECK-NEXT: call void @test_func(i32 noundef 10) +; CHECK-NEXT: ret i32 0 ; - - - - - - entry: - %.offload_baseptrs = alloca [1 x i8*], align 8 - %.offload_ptrs = alloca [1 x i8*], align 8 - %.offload_sizes = alloca [1 x i64], align 8 - - ; FIXME: call to @__tgt_target_data_begin_issue_mapper(...) should be moved here. - %call = tail call i32 (...) @rand() - - %conv = zext i32 %size to i64 - %0 = shl nuw nsw i64 %conv, 3 - %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 - %2 = bitcast [1 x i8*]* %.offload_baseptrs to double** - store double* %a, double** %2, align 8 - %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 - %4 = bitcast [1 x i8*]* %.offload_ptrs to double** - store double* %a, double** %4, align 8 - %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0 - store i64 %0, i64* %5, align 8 - call void @__tgt_target_data_begin_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null) - - %rem = urem i32 %call, %size - - call void @__tgt_target_data_end_mapper(%struct.ident_t* @0, i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null, i8** null) - ret i32 %rem + %call = call i32 (ptr, ...) 
@printf(ptr noundef @.str.1) + call void @test_func(i32 noundef 10) + ret i32 0 } -declare void @__tgt_target_data_begin_mapper(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**) -declare i32 @__tgt_target_teams_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**, i32, i32) -declare void @__tgt_target_data_end_mapper(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**) - -declare dso_local i32 @rand(...) - -; CHECK: declare void @__tgt_target_data_begin_mapper_issue(%struct.ident_t*, i64, i32, i8**, i8**, i64*, i64*, i8**, i8**, %struct.__tgt_async_info*) -; CHECK: declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info*) - -!llvm.module.flags = !{!0} - -!0 = !{i32 7, !"openmp", i32 50} +attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nounwind allocsize(0) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { nounwind } +attributes #3 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { nounwind allocsize(0) } + +!llvm.module.flags = !{!1, !2, !3, !4, !5, !6} +!llvm.ident = !{!7} +!llvm.embedded.objects = !{!8} + +!0 = !{} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 7, !"openmp", i32 50} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{i32 7, !"PIE Level", i32 2} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git cf1d9a1fdca258ab56f3060dfa4a303b8127350e)"} +!8 = !{ptr @llvm.embedded.object, !".llvm.offloading"} +!9 = distinct !{!9, !10} +!10 = !{!"llvm.loop.mustprogress"} +!11 = distinct !{!11, !10} +!12 = distinct !{!12, !10} +!13 = distinct !{!13, !10} +!14 = distinct !{!14, !10}
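; The block below is an illustrative sketch only (it is not verified by
; FileCheck) of what -openmp-hide-memory-transfer-latency is expected to do to
; the __tgt_target_data_begin_mapper call in @test_func above. The handle and
; the wrapper name are taken from the CHECK lines; everything else, including
; the "..." placeholders, is schematic.
;
;   %handle = alloca %struct.__tgt_async_info
;   ...
;   ; The offload-array setup and the "issue" call are outlined into a wrapper
;   ; function, and the call to that wrapper is hoisted as far up as alias,
;   ; dominator, post-dominator and loop information allow.
;   call void @__openmp_mapper_issue_wrapper_test_func.mapper.bb(..., ptr %handle)
;   ...   ; computation that does not touch the mapped buffer
;   ; The "wait" is sunk within its block to just before the first instruction
;   ; that may access the mapped buffer.
;   call void @__tgt_target_data_begin_mapper_wait(ptr @1, i64 -1, ptr %handle)
;   ...   ; first use of the mapped buffer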