diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -486,6 +486,114 @@
   /// \param Loc The location where the taskyield directive was encountered.
   void createTaskyield(const LocationDescription &Loc);
 
+  /// Functions used to generate reductions. Such functions take two Values
+  /// representing LHS and RHS of the reduction, respectively, and a reference
+  /// to the value that is updated to refer to the reduction result.
+  using ReductionGenTy =
+      function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;
+
+  /// Functions used to generate atomic reductions. Such functions take two
+  /// Values representing pointers to LHS and RHS of the reduction. They are
+  /// expected to atomically update the LHS to the reduced value.
+  using AtomicReductionGenTy =
+      function_ref<InsertPointTy(InsertPointTy, Value *, Value *)>;
+
+  /// Information about an OpenMP reduction.
+  struct ReductionInfo {
+    /// Returns the type of the element being reduced.
+    Type *getElementType() const {
+      return Variable->getType()->getPointerElementType();
+    }
+
+    /// Reduction variable of pointer type.
+    Value *Variable;
+
+    /// Thread-private partial reduction variable.
+    Value *PrivateVariable;
+
+    /// Callback for generating the reduction body.
+    ReductionGenTy ReductionGen;
+
+    /// Callback for generating the atomic reduction body, may be null.
+    AtomicReductionGenTy AtomicReductionGen;
+  };
+
+  /// Generator for '#omp reduction'.
+  ///
+  /// Emits the IR instructing the runtime to perform the specific kind of
+  /// reductions. Expects reduction variables to have been privatized and
+  /// initialized to reduction-neutral values separately. Emits the calls to
+  /// runtime functions as well as the reduction function and the basic blocks
+  /// performing the reduction atomically and non-atomically.
+  ///
+  /// The code emitted for the following:
+  ///
+  /// \code
+  ///   type var_1;
+  ///   type var_2;
+  ///   #pragma omp <directive> reduction(reduction-op:var_1,var_2)
+  ///   /* body */;
+  /// \endcode
+  ///
+  /// corresponds to the following sketch.
+  ///
+  /// \code
+  /// void _outlined_par() {
+  ///   // N is the number of different reductions.
+  ///   void *red_array[] = {privatized_var_1, privatized_var_2, ...};
+  ///   switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
+  ///                        _omp_reduction_func,
+  ///                        _gomp_critical_user.reduction.var)) {
+  ///   case 1: {
+  ///     var_1 = var_1 <reduction-op> privatized_var_1;
+  ///     var_2 = var_2 <reduction-op> privatized_var_2;
+  ///     // ...
+  ///    __kmpc_end_reduce(...);
+  ///     break;
+  ///   }
+  ///   case 2: {
+  ///     _Atomic<ReductionOp>(var_1, privatized_var_1);
+  ///     _Atomic<ReductionOp>(var_2, privatized_var_2);
+  ///     // ...
+  ///     break;
+  ///   }
+  ///   default: break;
+  ///   }
+  /// }
+  ///
+  /// void _omp_reduction_func(void **lhs, void **rhs) {
+  ///   *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
+  ///   *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
+  ///   // ...
+  /// }
+  /// \endcode
+  ///
+  /// \param Loc                The location where the reduction was
+  ///                           encountered. Must be within the associated
+  ///                           directive and after the last local access to the
+  ///                           reduction variables.
+  /// \param AllocaIP           An insertion point suitable for allocas usable
+  ///                           in reductions.
+  /// \param Variables          A list of variables in which the reduction
+  ///                           results will be stored (values of pointer type).
+  /// \param PrivateVariables   A list of variables in which the partial
+  ///                           reduction results are stored (values of pointer
+  ///                           type). Coindexed with Variables. Privatization
+  ///                           must be handled separately from this call.
+  /// \param ReductionGen       A list of generators for non-atomic reduction
+  ///                           bodies. Each takes a pair of partially reduced
+  ///                           values and sets a new one.
+  /// \param AtomicReductionGen A list of generators for atomic reduction
+  ///                           bodies, empty if the reduction cannot be
+  ///                           performed with atomics. Each takes a pair of
+  ///                           _pointers_ to partially reduced values and
+  ///                           atomically stores the result into the first.
+  /// \param IsNoWait           A flag set if the reduction is marked as nowait.
+  InsertPointTy createReductions(const LocationDescription &Loc,
+                                 InsertPointTy AllocaIP,
+                                 ArrayRef<ReductionInfo> ReductionInfos,
+                                 bool IsNoWait = false);
+
   ///}
 
   /// Return the insertion point used by the underlying IRBuilder.
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -914,6 +914,7 @@
   OMP_IDENT_FLAG(OMP_IDENT_FLAG_##Name, #Name, Value)
 
 __OMP_IDENT_FLAG(KMPC, 0x02)
+__OMP_IDENT_FLAG(ATOMIC_REDUCE, 0x10)
 __OMP_IDENT_FLAG(BARRIER_EXPL, 0x20)
 __OMP_IDENT_FLAG(BARRIER_IMPL, 0x0040)
 __OMP_IDENT_FLAG(BARRIER_IMPL_MASK, 0x01C0)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1022,6 +1022,195 @@
                               /*IsCancellable*/ true);
 }
 
+/// Creates a new internal-linkage function of type void(i8 *, i8 *) in \p M
+/// under the first name of the form ".omp.reduction.func.<N>" (N = 0, 1, ...)
+/// that is not yet taken by another global value in the module.
+Function *getFreshReductionFunc(Module &M) {
+  constexpr StringRef NamePrefix = ".omp.reduction.func.";
+  SmallString<64> Name(NamePrefix);
+  unsigned Suffix = 0;
+  raw_svector_ostream NameStream(Name);
+  Type *VoidTy = Type::getVoidTy(M.getContext());
+  Type *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
+  do {
+    Name.resize(NamePrefix.size());
+    // Advance the suffix on every probe so that a name collision moves on to
+    // the next candidate instead of retrying the same name forever.
+    NameStream << Suffix++;
+    if (M.getNamedValue(NameStream.str()))
+      continue;
+
+    FunctionCallee FC =
+        M.getOrInsertFunction(NameStream.str(), VoidTy, Int8PtrTy, Int8PtrTy);
+    Function *F = cast<Function>(FC.getCallee());
+    F->setLinkage(llvm::GlobalValue::InternalLinkage);
+    return F;
+  } while (true);
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
+    const LocationDescription &Loc, InsertPointTy AllocaIP,
+    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
+  for (const ReductionInfo &RI : ReductionInfos) {
+    // Only the asserts below use RI; avoid an unused-variable warning in
+    // builds where asserts are compiled out.
+    (void)RI;
+    assert(RI.Variable && "expected non-null variable");
+    assert(RI.PrivateVariable && "expected non-null private variable");
+    assert(RI.ReductionGen && "expected non-null reduction generator callback");
+    assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
+           "expected variables and their private equivalents to have the same "
+           "type");
+    assert(RI.Variable->getType()->isPointerTy() &&
+           "expected variables to be pointers");
+  }
+
+  if (!updateToLocation(Loc))
+    return InsertPointTy();
+
+  BasicBlock *InsertBlock = Loc.IP.getBlock();
+  BasicBlock *ContinuationBlock =
+      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
+  InsertBlock->getTerminator()->eraseFromParent();
+
+  // Create and populate array of type-erased pointers to private reduction
+  // values.
+  unsigned NumReductions = ReductionInfos.size();
+  Type *RedArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumReductions);
+  Builder.restoreIP(AllocaIP);
+  Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
+
+  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
+
+  for (auto En : enumerate(ReductionInfos)) {
+    unsigned Index = En.index();
+    const ReductionInfo &RI = En.value();
+    Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedArray, 0, Index, "red.array.elem." + Twine(Index));
+    Value *Casted =
+        Builder.CreateBitCast(RI.PrivateVariable, Builder.getInt8PtrTy(),
+                              "private.red.var." + Twine(Index) + ".casted");
+    Builder.CreateStore(Casted, RedArrayElemPtr);
+  }
+
+  // Emit a call to the runtime function that orchestrates the reduction.
+  // Declare the reduction function in the process.
+  Function *Func = Builder.GetInsertBlock()->getParent();
+  Module *Module = Func->getParent();
+  Value *RedArrayPtr =
+      Builder.CreateBitCast(RedArray, Builder.getInt8PtrTy(), "red.array.ptr");
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+  bool CanGenerateAtomic =
+      llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
+        return RI.AtomicReductionGen;
+      });
+  Value *Ident = getOrCreateIdent(
+      SrcLocStr, CanGenerateAtomic ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
+                                   : IdentFlag(0));
+  Value *ThreadId = getOrCreateThreadID(Ident);
+  Constant *NumVariables = Builder.getInt32(NumReductions);
+  const DataLayout &DL = Module->getDataLayout();
+  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
+  Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
+  Function *ReductionFunc = getFreshReductionFunc(*Module);
+  Value *Lock = getOMPCriticalRegionLock(".reduction");
+  Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
+      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
+               : RuntimeFunction::OMPRTL___kmpc_reduce);
+  CallInst *ReduceCall =
+      Builder.CreateCall(ReduceFunc,
+                         {Ident, ThreadId, NumVariables, RedArraySize,
+                          RedArrayPtr, ReductionFunc, Lock},
+                         "reduce");
+
+  // Create final reduction entry blocks for the atomic and non-atomic case.
+  // Emit IR that dispatches control flow to one of the blocks based on the
+  // reduction supporting the atomic mode.
+  BasicBlock *NonAtomicRedBlock =
+      BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
+  BasicBlock *AtomicRedBlock =
+      BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
+  SwitchInst *Switch =
+      Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
+  Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
+  Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
+
+  // Populate the non-atomic reduction using the elementwise reduction function.
+  // This loads the elements from the global and private variables and reduces
+  // them before storing back the result to the global variable.
+  Builder.SetInsertPoint(NonAtomicRedBlock);
+  for (auto En : enumerate(ReductionInfos)) {
+    const ReductionInfo &RI = En.value();
+    Type *ValueType = RI.getElementType();
+    Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable,
+                                         "red.value." + Twine(En.index()));
+    Value *PrivateRedValue =
+        Builder.CreateLoad(ValueType, RI.PrivateVariable,
+                           "red.private.value." + Twine(En.index()));
+    Value *Reduced = nullptr;
+    Builder.restoreIP(
+        RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced));
+    if (!Builder.GetInsertBlock())
+      return InsertPointTy();
+    Builder.CreateStore(Reduced, RI.Variable);
+  }
+  Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
+      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
+               : RuntimeFunction::OMPRTL___kmpc_end_reduce);
+  Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
+  Builder.CreateBr(ContinuationBlock);
+
+  // Populate the atomic reduction using the atomic elementwise reduction
+  // function. There are no loads/stores here because they will be happening
+  // inside the atomic elementwise reduction.
+  Builder.SetInsertPoint(AtomicRedBlock);
+  if (CanGenerateAtomic) {
+    for (const ReductionInfo &RI : ReductionInfos) {
+      Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.Variable,
+                                              RI.PrivateVariable));
+      if (!Builder.GetInsertBlock())
+        return InsertPointTy();
+    }
+    Builder.CreateBr(ContinuationBlock);
+  } else {
+    Builder.CreateUnreachable();
+  }
+
+  // Populate the outlined reduction function using the elementwise reduction
+  // function. Partial values are extracted from the type-erased array of
+  // pointers to private variables.
+  BasicBlock *ReductionFuncBlock =
+      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
+  Builder.SetInsertPoint(ReductionFuncBlock);
+  Value *LHSArrayPtr = Builder.CreateBitCast(ReductionFunc->getArg(0),
+                                             RedArrayTy->getPointerTo());
+  Value *RHSArrayPtr = Builder.CreateBitCast(ReductionFunc->getArg(1),
+                                             RedArrayTy->getPointerTo());
+  for (auto En : enumerate(ReductionInfos)) {
+    const ReductionInfo &RI = En.value();
+    Value *LHSI8PtrPtr =
+        Builder.CreateConstInBoundsGEP2_64(LHSArrayPtr, 0, En.index());
+    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), LHSI8PtrPtr);
+    Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
+    Value *LHS = Builder.CreateLoad(RI.getElementType(), LHSPtr);
+    Value *RHSI8PtrPtr =
+        Builder.CreateConstInBoundsGEP2_64(RHSArrayPtr, 0, En.index());
+    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), RHSI8PtrPtr);
+    Value *RHSPtr =
+        Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
+    Value *RHS = Builder.CreateLoad(RI.getElementType(), RHSPtr);
+    Value *Reduced = nullptr;
+    Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
+    if (!Builder.GetInsertBlock())
+      return InsertPointTy();
+    Builder.CreateStore(Reduced, LHSPtr);
+  }
+  Builder.CreateRetVoid();
+
+  Builder.SetInsertPoint(ContinuationBlock);
+  return Builder.saveIP();
+}
+
 OpenMPIRBuilder::InsertPointTy
 OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
                               BodyGenCallbackTy BodyGenCB,
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -154,14 +154,14 @@
       public ::testing::WithParamInterface<omp::OMPScheduleType> {};
 
 // Returns the value stored in the given allocation. Returns null if the given
-// value is not a result of an allocation, if no value is stored or if there is
-// more than one store.
-static Value *findStoredValue(Value *AllocaValue) {
-  Instruction *Alloca = dyn_cast<AllocaInst>(AllocaValue);
-  if (!Alloca)
+// value is not a result of an InstTy instruction, if no value is stored or if
+// there is more than one store.
+template <typename InstTy> static Value *findStoredValue(Value *AllocaValue) {
+  Instruction *Inst = dyn_cast<InstTy>(AllocaValue);
+  if (!Inst)
     return nullptr;
   StoreInst *Store = nullptr;
-  for (Use &U : Alloca->uses()) {
+  for (Use &U : Inst->uses()) {
     if (auto *CandidateStore = dyn_cast<StoreInst>(U.getUser())) {
       EXPECT_EQ(Store, nullptr);
       Store = CandidateStore;
@@ -545,7 +545,8 @@
   EXPECT_EQ(ForkCI->getArgOperand(1),
             ConstantInt::get(Type::getInt32Ty(Ctx), 1U));
   EXPECT_EQ(ForkCI->getArgOperand(2), Usr);
-  EXPECT_EQ(findStoredValue(ForkCI->getArgOperand(3)), F->arg_begin());
+  EXPECT_EQ(findStoredValue<AllocaInst>(ForkCI->getArgOperand(3)),
+            F->arg_begin());
 }
 
 TEST_F(OpenMPIRBuilderTest, ParallelNested) {
@@ -860,14 +861,15 @@
   EXPECT_TRUE(isa<GlobalVariable>(ForkCI->getArgOperand(0)));
   EXPECT_EQ(ForkCI->getArgOperand(1),
             ConstantInt::get(Type::getInt32Ty(Ctx), 1));
-  Value *StoredForkArg = findStoredValue(ForkCI->getArgOperand(3));
+  Value *StoredForkArg = findStoredValue<AllocaInst>(ForkCI->getArgOperand(3));
   EXPECT_EQ(StoredForkArg, F->arg_begin());
 
   EXPECT_EQ(DirectCI->getCalledFunction(), OutlinedFn);
   EXPECT_EQ(DirectCI->getNumArgOperands(), 3U);
   EXPECT_TRUE(isa<AllocaInst>(DirectCI->getArgOperand(0)));
   EXPECT_TRUE(isa<AllocaInst>(DirectCI->getArgOperand(1)));
-  Value *StoredDirectArg = findStoredValue(DirectCI->getArgOperand(2));
+  Value *StoredDirectArg =
+      findStoredValue<AllocaInst>(DirectCI->getArgOperand(2));
   EXPECT_EQ(StoredDirectArg, F->arg_begin());
 }
 
@@ -2517,6 +2519,353 @@
   EXPECT_FALSE(verifyModule(*M, &errs()));
 }
 
+/// Returns the single instruction of InstTy type in BB that uses the value V.
+/// If there is more than one such instruction, returns null.
+template <typename InstTy>
+static InstTy *findSingleUserInBlock(Value *V, BasicBlock *BB) {
+  InstTy *Result = nullptr;
+  for (User *U : V->users()) {
+    auto *Inst = dyn_cast<InstTy>(U);
+    if (!Inst || Inst->getParent() != BB)
+      continue;
+    if (Result)
+      return nullptr;
+    Result = Inst;
+  }
+  return Result;
+}
+
+/// Returns true if BB contains a simple binary reduction that loads a value
+/// from Accum, performs some binary operation with it, and stores it back to
+/// Accum.
+static bool isSimpleBinaryReduction(Value *Accum, BasicBlock *BB) {
+  StoreInst *Store = findSingleUserInBlock<StoreInst>(Accum, BB);
+  if (!Store)
+    return false;
+  auto *Stored = dyn_cast<BinaryOperator>(Store->getOperand(0));
+  if (!Stored)
+    return false;
+  auto *Load = dyn_cast<LoadInst>(Stored->getOperand(0));
+  return Load && Load->getOperand(0) == Accum;
+}
+
+/// Returns true if BB contains a binary reduction that reduces V using a binary
+/// operator into an accumulator that is a function argument.
+static bool isValueReducedToFuncArg(Value *V, BasicBlock *BB) {
+  auto *ReductionOp = findSingleUserInBlock<BinaryOperator>(V, BB);
+  if (!ReductionOp)
+    return false;
+
+  auto *GlobalLoad = dyn_cast<LoadInst>(ReductionOp->getOperand(0));
+  if (!GlobalLoad)
+    return false;
+
+  auto *Store = findSingleUserInBlock<StoreInst>(ReductionOp, BB);
+  if (!Store)
+    return false;
+
+  return Store->getPointerOperand() == GlobalLoad->getPointerOperand() &&
+         isa<Argument>(GlobalLoad->getPointerOperand());
+}
+
+/// Finds among users of Ptr a pair of GEP instructions with indices [0, 0] and
+/// [0, 1], respectively, and assigns results of these instructions to Zero and
+/// One. Returns true on success, false on failure or if such instructions are
+/// not unique among the users of Ptr.
+static bool findGEPZeroOne(Value *Ptr, Value *&Zero, Value *&One) {
+  Zero = nullptr;
+  One = nullptr;
+  for (User *U : Ptr->users()) {
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      if (GEP->getNumIndices() != 2)
+        continue;
+      auto *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
+      auto *SecondIdx = dyn_cast<ConstantInt>(GEP->getOperand(2));
+      EXPECT_NE(FirstIdx, nullptr);
+      EXPECT_NE(SecondIdx, nullptr);
+
+      EXPECT_TRUE(FirstIdx->isZero());
+      if (SecondIdx->isZero()) {
+        if (Zero)
+          return false;
+        Zero = GEP;
+      } else if (SecondIdx->isOne()) {
+        if (One)
+          return false;
+        One = GEP;
+      } else {
+        return false;
+      }
+    }
+  }
+  return Zero != nullptr && One != nullptr;
+}
+
+TEST_F(OpenMPIRBuilderTest, CreateReductions) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> Builder(BB);
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  // Create variables to be reduced.
+  InsertPointTy OuterAllocaIP(&F->getEntryBlock(),
+                              F->getEntryBlock().getFirstInsertionPt());
+  Value *SumReduced;
+  Value *XorReduced;
+  {
+    IRBuilderBase::InsertPointGuard Guard(Builder);
+    Builder.restoreIP(OuterAllocaIP);
+    SumReduced = Builder.CreateAlloca(Builder.getFloatTy());
+    XorReduced = Builder.CreateAlloca(Builder.getInt32Ty());
+  }
+
+  // Store initial values of reductions into global variables.
+  Builder.CreateStore(ConstantFP::get(Builder.getFloatTy(), 0.0), SumReduced);
+  Builder.CreateStore(Builder.getInt32(1), XorReduced);
+
+  // The loop body computes two reductions:
+  //   sum of (float) thread-id;
+  //   product of (thread-id + 1);
+  // and stores the result in global variables.
+  InsertPointTy BodyIP, BodyAllocaIP;
+  auto BodyGenCB = [&](InsertPointTy InnerAllocaIP, InsertPointTy CodeGenIP,
+                       BasicBlock &ContinuationBB) {
+    IRBuilderBase::InsertPointGuard Guard(Builder);
+    Builder.restoreIP(CodeGenIP);
+
+    Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc);
+    Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr);
+    Value *TID = OMPBuilder.getOrCreateThreadID(Ident);
+    Value *SumLocal =
+        Builder.CreateUIToFP(TID, Builder.getFloatTy(), "sum.local");
+    Value *ProdLocal =
+        Builder.CreateAdd(TID, Builder.getInt32(1), "prod.local");
+    Value *SumPartial = Builder.CreateLoad(SumReduced, "sum.partial");
+    Value *ProdPartial = Builder.CreateLoad(XorReduced, "prod.partial");
+    Value *Sum = Builder.CreateFAdd(SumPartial, SumLocal, "sum");
+    Value *Prod = Builder.CreateMul(ProdPartial, ProdLocal, "prod");
+    Builder.CreateStore(Sum, SumReduced);
+    Builder.CreateStore(Prod, XorReduced);
+
+    BodyIP = Builder.saveIP();
+    BodyAllocaIP = InnerAllocaIP;
+  };
+
+  // Privatization for reduction creates local copies of reduction variables and
+  // initializes them to reduction-neutral values.
+  Value *SumPrivatized;
+  Value *XorPrivatized;
+  auto PrivCB = [&](InsertPointTy InnerAllocaIP, InsertPointTy CodeGenIP,
+                    Value &Original, Value &Inner, Value *&ReplVal) {
+    IRBuilderBase::InsertPointGuard Guard(Builder);
+    Builder.restoreIP(InnerAllocaIP);
+    if (&Original == SumReduced) {
+      SumPrivatized = Builder.CreateAlloca(Builder.getFloatTy());
+      ReplVal = SumPrivatized;
+    } else if (&Original == XorReduced) {
+      XorPrivatized = Builder.CreateAlloca(Builder.getInt32Ty());
+      ReplVal = XorPrivatized;
+    } else {
+      ReplVal = &Inner;
+      return CodeGenIP;
+    }
+
+    Builder.restoreIP(CodeGenIP);
+    if (&Original == SumReduced)
+      Builder.CreateStore(ConstantFP::get(Builder.getFloatTy(), 0.0),
+                          SumPrivatized);
+    else if (&Original == XorReduced)
+      Builder.CreateStore(Builder.getInt32(0), XorPrivatized);
+
+    return Builder.saveIP();
+  };
+
+  // Do nothing in finalization.
+  auto FiniCB = [&](InsertPointTy CodeGenIP) { return CodeGenIP; };
+
+  InsertPointTy AfterIP =
+      OMPBuilder.createParallel(Loc, OuterAllocaIP, BodyGenCB, PrivCB, FiniCB,
+                                /* IfCondition */ nullptr,
+                                /* NumThreads */ nullptr, OMP_PROC_BIND_default,
+                                /* IsCancellable */ false);
+  Builder.restoreIP(AfterIP);
+
+  auto SumReduction = [&](InsertPointTy IP, Value *LHS, Value *RHS,
+                          Value *&Result) {
+    IRBuilderBase::InsertPointGuard Guard(Builder);
+    Builder.restoreIP(IP);
+    Result = Builder.CreateFAdd(LHS, RHS, "red.add");
+    return Builder.saveIP();
+  };
+
+  auto SumAtomicReduction = [&](InsertPointTy IP, Value *LHS, Value *RHS) {
+    IRBuilderBase::InsertPointGuard Guard(Builder);
+    Builder.restoreIP(IP);
+    Value *Partial = Builder.CreateLoad(RHS, "red.partial");
+    Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, LHS, Partial, None,
+                            AtomicOrdering::Monotonic);
+    return Builder.saveIP();
+  };
+
+  auto XorReduction = [&](InsertPointTy IP, Value *LHS, Value *RHS,
+                          Value *&Result) {
+    IRBuilderBase::InsertPointGuard Guard(Builder);
+    Builder.restoreIP(IP);
+    Result = Builder.CreateXor(LHS, RHS, "red.xor");
+    return Builder.saveIP();
+  };
+
+  auto XorAtomicReduction = [&](InsertPointTy IP, Value *LHS, Value *RHS) {
+    IRBuilderBase::InsertPointGuard Guard(Builder);
+    Builder.restoreIP(IP);
+    Value *Partial = Builder.CreateLoad(RHS, "red.partial");
+    Builder.CreateAtomicRMW(AtomicRMWInst::Xor, LHS, Partial, None,
+                            AtomicOrdering::Monotonic);
+    return Builder.saveIP();
+  };
+
+  OpenMPIRBuilder::ReductionInfo ReductionInfos[] = {
+      {SumReduced, SumPrivatized, SumReduction, SumAtomicReduction},
+      {XorReduced, XorPrivatized, XorReduction, XorAtomicReduction}};
+
+  OMPBuilder.createReductions(BodyIP, BodyAllocaIP, ReductionInfos);
+
+  Builder.restoreIP(AfterIP);
+  Builder.CreateRetVoid();
+
+  OMPBuilder.finalize(F);
+
+  // The IR must be valid.
+  EXPECT_FALSE(verifyModule(*M));
+
+  // Outlining must have happened.
+  Function *Outlined = M->getFunction("func..omp_par");
+  EXPECT_NE(Outlined, nullptr);
+
+  // Check that the lock variable was created with the expected name.
+  GlobalVariable *LockVar =
+      M->getGlobalVariable(".gomp_critical_user_.reduction.var");
+  EXPECT_NE(LockVar, nullptr);
+
+  // Find the allocation of a local array that will be used to call the runtime
+  // reduction function.
+  BasicBlock &AllocBlock = Outlined->getEntryBlock();
+  Value *LocalArray = nullptr;
+  for (Instruction &I : AllocBlock) {
+    if (AllocaInst *Alloc = dyn_cast<AllocaInst>(&I)) {
+      if (!Alloc->getAllocatedType()->isArrayTy() ||
+          !Alloc->getAllocatedType()->getArrayElementType()->isPointerTy())
+        continue;
+      LocalArray = Alloc;
+      break;
+    }
+  }
+  ASSERT_NE(LocalArray, nullptr);
+
+  // Find the call to the runtime reduction function.
+  BasicBlock *BB = AllocBlock.getUniqueSuccessor();
+  Value *LocalArrayPtr = nullptr;
+  Value *ReductionFnVal = nullptr;
+  Value *SwitchArg = nullptr;
+  for (Instruction &I : *BB) {
+    if (CallInst *Call = dyn_cast<CallInst>(&I)) {
+      if (Call->getCalledFunction() !=
+          OMPBuilder.getOrCreateRuntimeFunctionPtr(
+              RuntimeFunction::OMPRTL___kmpc_reduce))
+        continue;
+      LocalArrayPtr = Call->getOperand(4);
+      ReductionFnVal = Call->getOperand(5);
+      SwitchArg = Call;
+      break;
+    }
+  }
+
+  // Check that the local array is passed to the function.
+  ASSERT_NE(LocalArrayPtr, nullptr);
+  BitCastInst *BitCast = dyn_cast<BitCastInst>(LocalArrayPtr);
+  ASSERT_NE(BitCast, nullptr);
+  EXPECT_EQ(BitCast->getOperand(0), LocalArray);
+
+  // Find the GEP instructions preceding stores to the local array.
+  Value *FirstArrayElemPtr = nullptr;
+  Value *SecondArrayElemPtr = nullptr;
+  EXPECT_EQ(LocalArray->getNumUses(), 3u);
+  ASSERT_TRUE(
+      findGEPZeroOne(LocalArray, FirstArrayElemPtr, SecondArrayElemPtr));
+
+  // Check that the values stored into the local array are privatized reduction
+  // variables.
+  auto *FirstStored = dyn_cast_or_null<BitCastInst>(
+      findStoredValue<GetElementPtrInst>(FirstArrayElemPtr));
+  auto *SecondStored = dyn_cast_or_null<BitCastInst>(
+      findStoredValue<GetElementPtrInst>(SecondArrayElemPtr));
+  ASSERT_NE(FirstStored, nullptr);
+  ASSERT_NE(SecondStored, nullptr);
+  Value *FirstPrivatized = FirstStored->getOperand(0);
+  Value *SecondPrivatized = SecondStored->getOperand(0);
+  EXPECT_TRUE(
+      isSimpleBinaryReduction(FirstPrivatized, FirstStored->getParent()));
+  EXPECT_TRUE(
+      isSimpleBinaryReduction(SecondPrivatized, SecondStored->getParent()));
+
+  // Check that the result of the runtime reduction call is used for further
+  // dispatch.
+  ASSERT_EQ(SwitchArg->getNumUses(), 1u);
+  SwitchInst *Switch = dyn_cast<SwitchInst>(*SwitchArg->user_begin());
+  ASSERT_NE(Switch, nullptr);
+  EXPECT_EQ(Switch->getNumSuccessors(), 3u);
+  BasicBlock *NonAtomicBB = Switch->case_begin()->getCaseSuccessor();
+  BasicBlock *AtomicBB = std::next(Switch->case_begin())->getCaseSuccessor();
+
+  // Non-atomic block contains reductions to the global reduction variable,
+  // which is passed into the outlined function as an argument.
+  Value *FirstLoad =
+      findSingleUserInBlock<LoadInst>(FirstPrivatized, NonAtomicBB);
+  Value *SecondLoad =
+      findSingleUserInBlock<LoadInst>(SecondPrivatized, NonAtomicBB);
+  EXPECT_TRUE(isValueReducedToFuncArg(FirstLoad, NonAtomicBB));
+  EXPECT_TRUE(isValueReducedToFuncArg(SecondLoad, NonAtomicBB));
+
+  // Atomic block also contains reductions to the global reduction variable.
+  FirstLoad = findSingleUserInBlock<LoadInst>(FirstPrivatized, AtomicBB);
+  SecondLoad = findSingleUserInBlock<LoadInst>(SecondPrivatized, AtomicBB);
+  auto *FirstAtomic = findSingleUserInBlock<AtomicRMWInst>(FirstLoad, AtomicBB);
+  auto *SecondAtomic =
+      findSingleUserInBlock<AtomicRMWInst>(SecondLoad, AtomicBB);
+  ASSERT_NE(FirstAtomic, nullptr);
+  EXPECT_TRUE(isa<Argument>(FirstAtomic->getPointerOperand()));
+  ASSERT_NE(SecondAtomic, nullptr);
+  EXPECT_TRUE(isa<Argument>(SecondAtomic->getPointerOperand()));
+
+  // Check that the separate reduction function also performs (non-atomic)
+  // reductions after extracting reduction variables from its arguments.
+  Function *ReductionFn = cast<Function>(ReductionFnVal);
+  BasicBlock *FnReductionBB = &ReductionFn->getEntryBlock();
+  auto *Bitcast =
+      findSingleUserInBlock<BitCastInst>(ReductionFn->getArg(0), FnReductionBB);
+  Value *FirstLHSPtr;
+  Value *SecondLHSPtr;
+  ASSERT_TRUE(findGEPZeroOne(Bitcast, FirstLHSPtr, SecondLHSPtr));
+  Value *Opaque = findSingleUserInBlock<LoadInst>(FirstLHSPtr, FnReductionBB);
+  ASSERT_NE(Opaque, nullptr);
+  Bitcast = findSingleUserInBlock<BitCastInst>(Opaque, FnReductionBB);
+  ASSERT_NE(Bitcast, nullptr);
+  EXPECT_TRUE(isSimpleBinaryReduction(Bitcast, FnReductionBB));
+  Opaque = findSingleUserInBlock<LoadInst>(SecondLHSPtr, FnReductionBB);
+  ASSERT_NE(Opaque, nullptr);
+  Bitcast = findSingleUserInBlock<BitCastInst>(Opaque, FnReductionBB);
+  ASSERT_NE(Bitcast, nullptr);
+  EXPECT_TRUE(isSimpleBinaryReduction(Bitcast, FnReductionBB));
+
+  Bitcast =
+      findSingleUserInBlock<BitCastInst>(ReductionFn->getArg(1), FnReductionBB);
+  Value *FirstRHS;
+  Value *SecondRHS;
+  EXPECT_TRUE(findGEPZeroOne(Bitcast, FirstRHS, SecondRHS));
+}
+
 TEST_F(OpenMPIRBuilderTest, CreateSections) {
   using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
   using BodyGenCallbackTy = llvm::OpenMPIRBuilder::StorableBodyGenCallbackTy;