Index: clang/lib/CodeGen/CGStmtOpenMP.cpp =================================================================== --- clang/lib/CodeGen/CGStmtOpenMP.cpp +++ clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -2664,59 +2664,102 @@ } void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) { - bool UseOMPIRBuilder = - CGM.getLangOpts().OpenMPIRBuilder && isSupportedByOpenMPIRBuilder(S); + bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIRBuilder; + //bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIsDevice; if (UseOMPIRBuilder) { - auto &&CodeGenIRBuilder = [this, &S, UseOMPIRBuilder](CodeGenFunction &CGF, - PrePostActionTy &) { - // Use the OpenMPIRBuilder if enabled. - if (UseOMPIRBuilder) { - llvm::MapVector AlignedVars = - GetAlignedMapping(S, CGF); - // Emit the associated statement and get its loop representation. - const Stmt *Inner = S.getRawStmt(); - llvm::CanonicalLoopInfo *CLI = - EmitOMPCollapsedCanonicalLoopNest(Inner, 1); - - llvm::OpenMPIRBuilder &OMPBuilder = - CGM.getOpenMPRuntime().getOMPBuilder(); - // Add SIMD specific metadata - llvm::ConstantInt *Simdlen = nullptr; - if (const auto *C = S.getSingleClause()) { - RValue Len = - this->EmitAnyExpr(C->getSimdlen(), AggValueSlot::ignored(), - /*ignoreResult=*/true); - auto *Val = cast(Len.getScalarVal()); - Simdlen = Val; - } - llvm::ConstantInt *Safelen = nullptr; - if (const auto *C = S.getSingleClause()) { - RValue Len = - this->EmitAnyExpr(C->getSafelen(), AggValueSlot::ignored(), - /*ignoreResult=*/true); - auto *Val = cast(Len.getScalarVal()); - Safelen = Val; - } - llvm::omp::OrderKind Order = llvm::omp::OrderKind::OMP_ORDER_unknown; - if (const auto *C = S.getSingleClause()) { - if (C->getKind() == OpenMPOrderClauseKind ::OMPC_ORDER_concurrent) { - Order = llvm::omp::OrderKind::OMP_ORDER_concurrent; + auto *CS = dyn_cast(S.getAssociatedStmt()); + auto *CL = dyn_cast(CS->getCapturedStmt()); + CGCapturedStmtInfo CGSI(*CS, CR_OpenMP); + + CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(*this, &CGSI); + llvm::OpenMPIRBuilder::InsertPointTy AllocaIP( + AllocaInsertPt->getParent(), AllocaInsertPt->getIterator()); + + const auto *For = dyn_cast(CL->getLoopStmt()); + if(const Stmt *InitStmt = For->getInit()) + EmitStmt(InitStmt); + const DeclRefExpr *LoopVarRef = CL->getLoopVarRef(); + LValue LCVal = EmitLValue(LoopVarRef); + Address LoopVarAddress = LCVal.getAddress(*this); + llvm::AllocaInst *LoopVar = dyn_cast(LoopVarAddress.getPointer()); + + llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder(); + + using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; + + // FIXME check if trip count is signed + auto DistanceCB = [this, CL, LoopVar](InsertPointTy CodeGenIP, llvm::Value *&TripCount, bool &Signed) -> void { + Builder.restoreIP(CodeGenIP); + + const CapturedStmt *DistanceFunc = CL->getDistanceFunc(); + EmittedClosureTy DistanceClosure = emitCapturedStmtFunc(*this, DistanceFunc); + + QualType LogicalTy = DistanceFunc->getCapturedDecl() + ->getParam(0) + ->getType() + .getNonReferenceType(); + Address CountAddr = CreateMemTemp(LogicalTy, ".count.addr"); + emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()}); + TripCount = Builder.CreateLoad(CountAddr, ".count"); + + return; + }; + + auto FiniCB = [this](InsertPointTy IP) { + OMPBuilderCBHelpers::FinalizeOMPRegion(*this, IP); + }; + + auto PrivCB = [](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + llvm::Value &, llvm::Value &Val, llvm::Value *&ReplVal) { + ReplVal = &Val; + return CodeGenIP; + }; + + const Stmt *loopBody = S.getBody(); + auto BodyGenCB = [loopBody, this, CL, LoopVar, &S] + (InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + llvm::Value *Virtual) { + llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock(); + + OMPBuilderCBHelpers::EmitOMPOutlinedRegionBody( + *this, + loopBody, + AllocaIP, + CodeGenIP, + "simd"); + + Builder.restoreIP(AllocaIP); + llvm::AllocaInst *NewLoopVar = + Builder.CreateAlloca(LoopVar->getAllocatedType(), LoopVar->getAddressSpace(), + LoopVar->getArraySize(), LoopVar->getName()+".loopvar"); + + for(llvm::User *U : LoopVar->users()) { + if(auto I = dyn_cast(U)) { + if(I->getParent() == CodeGenIPBB) { + U->replaceUsesOfWith(LoopVar, NewLoopVar); } } - // Add simd metadata to the collapsed loop. Do not generate - // another loop for if clause. Support for if clause is done earlier. - OMPBuilder.applySimd(CLI, AlignedVars, - /*IfCond*/ nullptr, Order, Simdlen, Safelen); - return; } + + const CapturedStmt *LoopVarFunc = CL->getLoopVarFunc(); + EmittedClosureTy LoopVarClosure = emitCapturedStmtFunc(*this, LoopVarFunc); + Builder.SetInsertPoint(CodeGenIPBB, CodeGenIPBB->begin()); + emitCapturedStmtCall(*this, LoopVarClosure, + {NewLoopVar, Virtual}); + }; - { - auto LPCRegion = - CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S); - OMPLexicalScope Scope(*this, S, OMPD_unknown); - CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_simd, - CodeGenIRBuilder); - } + + Builder.restoreIP( + OMPBuilder.createSimdLoop( + Builder, + AllocaIP, + BodyGenCB, + DistanceCB, + PrivCB, + FiniCB, + /*SPMD*/ true + )); + return; } Index: clang/lib/CodeGen/CodeGenFunction.cpp =================================================================== --- clang/lib/CodeGen/CodeGenFunction.cpp +++ clang/lib/CodeGen/CodeGenFunction.cpp @@ -94,7 +94,8 @@ // seems to be a reasonable spot. We do it here, as opposed to the deletion // time of the CodeGenModule, because we have to ensure the IR has not yet // been "emitted" to the outside, thus, modifications are still sensible. - if (CGM.getLangOpts().OpenMPIRBuilder && CurFn) + //if (CGM.getLangOpts().OpenMPIRBuilder && CurFn) + if (CurFn) CGM.getOpenMPRuntime().getOMPBuilder().finalize(CurFn); } Index: clang/lib/Parse/ParseOpenMP.cpp =================================================================== --- clang/lib/Parse/ParseOpenMP.cpp +++ clang/lib/Parse/ParseOpenMP.cpp @@ -2953,7 +2953,7 @@ Sema::CompoundScopeRAII Scope(Actions); AssociatedStmt = ParseStatement(); - if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) && + if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) && getLangOpts().OpenMPIRBuilder) AssociatedStmt = Actions.ActOnOpenMPLoopnest(AssociatedStmt.get()); } Index: llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h =================================================================== --- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -506,6 +506,16 @@ using BodyGenCallbackTy = function_ref; + using LoopBodyCallbackTy = + function_ref; + + using TripCountCallbackTy = + function_ref< + void(InsertPointTy CodeGenIP, Value *&TripCount, bool &Signed) + >; + // This is created primarily for sections construct as llvm::function_ref // (BodyGenCallbackTy) is not storable (as described in the comments of // function_ref class - function_ref contains non-ownable reference @@ -605,6 +615,15 @@ Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable); + IRBuilder<>::InsertPoint + createSimdLoop(const LocationDescription &Loc, InsertPointTy AllocaIP, + LoopBodyCallbackTy BodyGenCB, + TripCountCallbackTy DistanceCB, + PrivatizeCallbackTy PrivCB, + FinalizeCallbackTy FiniCB, + bool SPMDMode); + + /// Generator for the control flow structure of an OpenMP canonical loop. /// /// This generator operates on the logical iteration space of the loop, i.e. Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def =================================================================== --- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -119,6 +119,8 @@ __OMP_FUNCTION_TYPE(InterWarpCopy, false, Void, VoidPtr, Int32) __OMP_FUNCTION_TYPE(GlobalList, false, Void, VoidPtr, Int32, VoidPtr) +__OMP_FUNCTION_TYPE(LoopTask, false, Void, Int64, VoidPtrPtr) + #undef __OMP_FUNCTION_TYPE #undef OMP_FUNCTION_TYPE @@ -484,6 +486,8 @@ __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,) __OMP_RTL(__kmpc_syncwarp, false, Void, Int64) +__OMP_RTL(__kmpc_simd_51, false, Void, IdentPtr, LoopTaskPtr, Int64, VoidPtrPtr, Int32) + __OMP_RTL(__last, false, Void, ) #undef __OMP_RTL Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp =================================================================== --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -454,6 +454,7 @@ for (OutlineInfo &OI : OutlineInfos) { // Skip functions that have not finalized yet; may happen with nested // function generation. + if (Fn && OI.getFunction() != Fn) { DeferredOutlines.push_back(OI); continue; @@ -462,7 +463,6 @@ ParallelRegionBlockSet.clear(); Blocks.clear(); OI.collectBlocks(ParallelRegionBlockSet, Blocks); - Function *OuterFn = OI.getFunction(); CodeExtractorAnalysisCache CEAC(*OuterFn); CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, @@ -481,11 +481,10 @@ assert(Extractor.isEligible() && "Expected OpenMP outlining to be possible!"); - for (auto *V : OI.ExcludeArgsFromAggregate) + for (auto *V : OI.ExcludeArgsFromAggregate) { Extractor.excludeArgFromAggregate(V); - + } Function *OutlinedFn = Extractor.extractCodeRegion(CEAC); - LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n"); LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n"); assert(OutlinedFn->getReturnType()->isVoidTy() && @@ -1230,6 +1229,308 @@ return AfterIP; } + +IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( + const LocationDescription &Loc, InsertPointTy OuterAllocaIP, + LoopBodyCallbackTy BodyGenCB, + TripCountCallbackTy DistanceCB, + PrivatizeCallbackTy PrivCB, + FinalizeCallbackTy FiniCB, + bool SPMDMode +) +{ + assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous"); + + if (!updateToLocation(Loc)) + return Loc.IP; + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = getOrCreateThreadID(Ident); + + BasicBlock *InsertBB = Builder.GetInsertBlock(); + Function *OuterFn = InsertBB->getParent(); + + LLVM_DEBUG(dbgs() << "At the start of createSimdLoop: " << *OuterFn << "\n"); + + // Save the outer alloca block because the insertion iterator may get + // invalidated and we still need this later. + BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock(); + + // Vector to remember instructions we used only during the modeling but which + // we want to delete at the end. + SmallVector ToBeDeleted; + + // Create an artificial insertion point that will also ensure the blocks we + // are about to split are not degenerated. + auto *UI = new UnreachableInst(Builder.getContext(), InsertBB); + + Instruction *ThenTI = UI, *ElseTI = nullptr; + + BasicBlock *ThenBB = ThenTI->getParent(); + BasicBlock *LRegDistanceBB = ThenBB->splitBasicBlock(ThenTI, "omp.loop.distance"); + BasicBlock *PRegEntryBB = LRegDistanceBB->splitBasicBlock(ThenTI, "omp.loop.entry"); + BasicBlock *PRegBodyBB = + PRegEntryBB->splitBasicBlock(ThenTI, "omp.loop.region"); + BasicBlock *PRegPreFiniBB = + PRegBodyBB->splitBasicBlock(ThenTI, "omp.loop.pre_finalize"); + BasicBlock *PRegExitBB = + PRegPreFiniBB->splitBasicBlock(ThenTI, "omp.loop.exit"); + + + auto FiniCBWrapper = [&](InsertPointTy IP) { + // Hide "open-ended" blocks from the given FiniCB by setting the right jump + // target to the region exit blocks + if (IP.getBlock()->end() == IP.getPoint()) { + IRBuilder<>::InsertPointGuard IPG(Builder); + Builder.restoreIP(IP); + Instruction *I = Builder.CreateBr(PRegExitBB); + IP = InsertPointTy(I->getParent(), I->getIterator()); + } + assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 && + IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB && + "Unexpected insertion point for finalization call!"); + return FiniCB(IP); + }; + + FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, false}); + + // Compute the loop trip count + // Insert after the outer alloca to ensure all variables need + // in its calculation are ready + InsertPointTy DistanceIP(LRegDistanceBB, LRegDistanceBB->begin()); + Value *DistVal; + bool IsTripCountSigned; + assert(DistanceCB && "expected loop trip count callback function!"); + DistanceCB(DistanceIP, DistVal, IsTripCountSigned); + assert(DistVal && "trip count call back should return integer trip count"); + Type *DistValType = DistVal->getType(); + assert(DistValType->isIntegerTy() && "trip count should be integer type"); + + // Possibly need to cast DistVal to Int64 + // FIXME if the integer is signed it needs to be converted to + // unsigned. If the tripcount is <0 it's fine to just convert + // it to 0. + if(!DistValType->isIntegerTy(64)) { + Builder.SetInsertPoint(LRegDistanceBB->getTerminator()); + DistVal = Builder.CreateIntCast( + DistVal, Int64, /*IsTripCountSigned*/ false, DistVal->getName()+".casted"); + } + + LLVM_DEBUG(dbgs() << "After DistanceCB: " << *LRegDistanceBB << "\n"); + LLVM_DEBUG(dbgs() << "Trip count variable: " << *DistVal << "\n"); + + // Create the virtual iteration variable that will be pulled into + // the outlined function. + Builder.restoreIP(OuterAllocaIP); + AllocaInst *OMPIVAlloca = Builder.CreateAlloca(Int64, nullptr, "omp.iv.tmp"); + LoadInst *OMPIVLoad = Builder.CreateLoad(Int64, OMPIVAlloca, "omp.iv"); + + // Generate the privatization allocas in the block that will become the entry + // of the outlined function. + Builder.SetInsertPoint(PRegEntryBB->getTerminator()); + InsertPointTy InnerAllocaIP = Builder.saveIP(); + + // Use omp.iv in the outlined region. Cast it if needed. + Instruction *OMPIV; + if(!DistValType->isIntegerTy(64)) { + // Cast omp.iv to the same type as the trip count. + // If the cast is needed, keep it in the outlined region + OMPIV = dyn_cast( + Builder.CreateTrunc(OMPIVLoad, DistValType, "omp.iv.casted")); + } else { + // If cast is unneeded, we still need to generate a fake use of + // omp.iv so the outlined function picks it up as the first arg + Instruction *OMPIVUse = dyn_cast( + Builder.CreateAdd(OMPIVLoad, Builder.getInt64(0), "omp.iv.tobedeleted")); + OMPIV = OMPIVLoad; + ToBeDeleted.push_back(OMPIVUse); + } + + // Order matters + ToBeDeleted.push_back(OMPIVLoad); + ToBeDeleted.push_back(OMPIVAlloca); + + LLVM_DEBUG(llvm::dbgs() << "omp.iv variable generated: " << *OuterFn << "\n"); + + LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n"); + assert(BodyGenCB && "Expected body generation callback!"); + InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin()); + + BodyGenCB(InnerAllocaIP, CodeGenIP, OMPIV); + + LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n"); + + FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_simd_51); + + OutlineInfo OI; + + // Adjust the finalization stack, verify the adjustment, and call the + // finalize function a last time to finalize values between the pre-fini + // block and the exit block if we left the parallel "the normal way". + auto FiniInfo = FinalizationStack.pop_back_val(); + (void)FiniInfo; + assert(FiniInfo.DK == OMPD_parallel && + "Unexpected finalization stack state!"); + + Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator(); + + InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator()); + FiniCB(PreFiniIP); + + OI.OuterAllocaBB = OuterAllocaBlock; + OI.EntryBB = PRegEntryBB; + OI.ExitBB = PRegExitBB; + + SmallPtrSet ParallelRegionBlockSet; + SmallVector Blocks; + OI.collectBlocks(ParallelRegionBlockSet, Blocks); + + // Ensure a single exit node for the outlined region by creating one. + // We might have multiple incoming edges to the exit now due to finalizations, + // e.g., cancel calls that cause the control flow to leave the region. + BasicBlock *PRegOutlinedExitBB = PRegExitBB; + PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt()); + PRegOutlinedExitBB->setName("omp.loop.outlined.exit"); + Blocks.push_back(PRegOutlinedExitBB); + + CodeExtractorAnalysisCache CEAC(*OuterFn); + + CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, + /* AggregateArgs */ true, + /* BlockFrequencyInfo */ nullptr, + /* BranchProbabilityInfo */ nullptr, + /* AssumptionCache */ nullptr, + /* AllowVarArgs */ false, + /* AllowAlloca */ true, + /* AllocationBlock */ OuterAllocaBlock, + /* Suffix */ ".omp_simd"); + + BasicBlock *CommonExit = nullptr; + SetVector Inputs, Outputs, SinkingCands, HoistingCands; + Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit); + Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands); + + LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n"); + + auto PrivHelper = [&](Value &V) { + // Exclude omp.iv from aggregate + if (&V == OMPIVLoad) { + OI.ExcludeArgsFromAggregate.push_back(&V); + return; + } + + // Get all uses of value that are inside of the outlined region + SetVector Uses; + for (Use &U : V.uses()) + if (auto *UserI = dyn_cast(U.getUser())) + if (ParallelRegionBlockSet.count(UserI->getParent())) + Uses.insert(&U); + + Value *Inner = &V; + + // If the value isn't a pointer type, store it in a pointer + // Unpack it inside the outlined region + if (!V.getType()->isPointerTy()) { + IRBuilder<>::InsertPointGuard Guard(Builder); + LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n"); + + Builder.restoreIP(OuterAllocaIP); + Value *Ptr = Builder.CreateAlloca( + V.getType(), nullptr, V.getName() + ".reloaded"); + + // Store to stack at end of the block that currently branches to the entry + // block of the to-be-outlined region. + Builder.SetInsertPoint( + InsertBB, InsertBB->getTerminator()->getIterator()); + Builder.CreateStore(&V, Ptr); + + // Load back next to allocations in the to-be-outlined region. + Builder.restoreIP(InnerAllocaIP); + Inner = Builder.CreateLoad(V.getType(), Ptr); + } + + Value *ReplacementValue = nullptr; + Builder.restoreIP( + PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue)); + assert(ReplacementValue && + "Expected copy/create callback to set replacement value!"); + if (ReplacementValue == &V) + return; + + for (Use *UPtr : Uses) + UPtr->set(ReplacementValue); + + }; + + InnerAllocaIP = IRBuilder<>::InsertPoint( + OMPIV->getParent(), OMPIV->getNextNode()->getIterator()); + + // Reset the outer alloca insertion point to the entry of the relevant block + // in case it was invalidated. + OuterAllocaIP = IRBuilder<>::InsertPoint( + OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt()); + + for (Value *Input : Inputs) { + PrivHelper(*Input); + } + + assert(Outputs.empty() && + "OpenMP outlining should not produce live-out values!"); + + LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n"); + for (auto *BB : Blocks) { + LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n"); + } + + int NumInputs = Inputs.size()-1; // One argument is always omp.iv + OI.PostOutlineCB = [=](Function &OutlinedFn) { + OutlinedFn.addFnAttr(Attribute::NoUnwind); + OutlinedFn.addFnAttr(Attribute::NoRecurse); + + assert(OutlinedFn.arg_size() == 2 && + "Expected omp.iv & structArg as arguments"); + + CallInst *CI = cast(OutlinedFn.user_back()); + BasicBlock *CallBlock = CI->getParent(); + CallBlock->setName("omp_loop"); + Builder.SetInsertPoint(CI); + + Value * StructArg = CI->getArgOperand(1); // 0 should be omp.iv + + Value *SimdArgs[] = { + Ident, + Builder.CreateBitCast(&OutlinedFn, LoopTaskPtr), + DistVal, + Builder.CreateCast(Instruction::BitCast, StructArg, Int8PtrPtr), + Builder.getInt32(NumInputs)}; + + SmallVector RealArgs; + RealArgs.append(std::begin(SimdArgs), std::end(SimdArgs)); + + CallInst *Simd51Call = Builder.CreateCall(RTLFn, RealArgs); + + LLVM_DEBUG(dbgs() << "With runtime call placed: " << *Builder.GetInsertBlock()->getParent() << "\n"); + + InsertPointTy ExitIP(PRegExitBB, PRegExitBB->end()); + + CI->eraseFromParent(); + + for (Instruction *I : ToBeDeleted) + I->eraseFromParent(); + + }; + + addOutlineInfo(std::move(OI)); + + InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end()); + UI->eraseFromParent(); + + return AfterIP; + +} + void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) { // Build call void __kmpc_flush(ident_t *loc) uint32_t SrcLocStrSize; Index: openmp/libomptarget/DeviceRTL/src/Workshare.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Workshare.cpp +++ openmp/libomptarget/DeviceRTL/src/Workshare.cpp @@ -658,6 +658,19 @@ void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) { FunctionTracingRAII(); } + +void __kmpc_simd_51( + IdentTy *ident, void *WorkFn, uint64_t TripCount, + void **Args, uint32_t nargs +) { + FunctionTracingRAII(); + + ASSERT(WorkFn); + for(uint64_t omp_iv = 0; omp_iv < TripCount; omp_iv++) { + ((void (*)(uint64_t, void**))WorkFn)(omp_iv, Args); + } + +} } #pragma omp end declare target Index: openmp/libomptarget/test/offloading/simd.c =================================================================== --- /dev/null +++ openmp/libomptarget/test/offloading/simd.c @@ -0,0 +1,27 @@ +#include +#include +#include + +int main() { + + int *A = (int*) malloc(32*sizeof(int)); + + #pragma omp target teams map(tofrom:A[0:32]) num_teams(1) + { + #pragma omp parallel num_threads(32) + { + #pragma omp simd + for(int i = 0; i < 32; i++) + A[i] = 1; + } + } + + for(int i = 0; i < 32; i++) + assert(A[i] == 1); + + free(A); + + printf("PASS\n"); +} +// CHECK: PASS +