Index: include/polly/CodeGen/LoopGenerators.h =================================================================== --- include/polly/CodeGen/LoopGenerators.h +++ include/polly/CodeGen/LoopGenerators.h @@ -13,10 +13,11 @@ //===----------------------------------------------------------------------===// #ifndef POLLY_LOOP_GENERATORS_H #define POLLY_LOOP_GENERATORS_H + #include "polly/CodeGen/IRBuilder.h" -#include "llvm/ADT/SetVector.h" -#include +#include "llvm/IR/ValueMap.h" +#include "llvm/ADT/SetVector.h" namespace llvm { class Value; @@ -50,73 +51,165 @@ ICmpInst::Predicate Predicate, LoopAnnotator *Annotator = NULL, bool Parallel = false); -class OMPGenerator { +/// @brief The ParallelLoopGenerator allows to create parallelized loops +/// +/// To parallelize a loop, we perform the following steps: +/// o Generate a subfunction which will hold the loop body. +/// o Create a struct to hold all outer values needed in the loop body. +/// o Create calls to a runtime library to achieve the actual parallelism. +/// These calls will spawn and join threads, define how the work (here the +/// iterations) are distributed between them and make sure each has access +/// to the struct holding all needed values. +/// +/// At the moment we support two runtimes, OpenMP and Polly-Threads (a pthread +/// wrapper). 
+/// +/// If we parallelize the outer loop of the following loop nest, +/// +/// S0; +/// for (int i = 0; i < N; i++) +/// for (int j = 0; j < M; j++) +/// S1(i, j); +/// S2; +/// +/// we will generate the following code (with different runtime function names): +/// +/// S0; +/// auto *values = storeValuesIntoStruct(); +/// // Execute subfunction with multiple threads +/// spawn_threads(subfunction, values); +/// join_threads(); +/// S2; +/// +/// // This function is executed in parallel by different threads +/// void subfunction(values) { +/// while (auto *WorkItem = getWorkItem()) { +/// int LB = WorkItem.begin(); +/// int UB = WorkItem.end(); +/// for (int i = LB; i < UB; i++) +/// for (int j = 0; j < M; j++) +/// S1(i, j); +/// } +/// cleanup_thread(); +/// } +class ParallelLoopGenerator { + + /// @brief The IR builder we use to create instructions. + PollyIRBuilder &Builder; + + /// @brief A pass pointer to update analysis information. + Pass *P; + + /// @brief The loop info of the current function we need to update. + LoopInfo &LI; + + /// @brief The dominance tree of the current function we need to update. + DominatorTree &DT; + + /// @brief The type we use for induction variable related instructions. + Type *IVType; + + /// @brief The current module + Module *M; + public: - typedef std::map ValueToValueMapTy; + using ValueToValueMapTy = llvm::ValueMap; - OMPGenerator(PollyIRBuilder &Builder, Pass *P) : Builder(Builder), P(P) {} + /// @brief Create a parallel loop generator for the current function + ParallelLoopGenerator(PollyIRBuilder &Builder, Pass *P, LoopInfo &LI, + DominatorTree &DT, Type *IVType) + : Builder(Builder), P(P), LI(LI), DT(DT), IVType(IVType), M(getModule()) { + } - /// @brief Create an OpenMP parallel loop. - /// + /// @brief Create a parallel loop /// - /// @param LowerBound The starting value of the induction variable. - /// @param UpperBound The upper bound of the induction variable. 
- /// @param Stride The value by which the induction variable is - /// incremented. /// - /// @param UsedValues A set of LLVM-IR Values that should be available to - /// the new loop body. - /// @param VMap This map is filled by createParallelLoop(). It - /// maps the values in UsedValues to Values through which - /// their content is available within the loop body. - /// @param LoopBody A pointer to an iterator that is set to point to the - /// body of the created loop. It should be used to insert - /// instructions that form the actual loop body. + /// @param LB The lower bound for the loop we parallelize + /// @param UB The upper bound for the loop we parallelize + /// @param Stride The stride of the loop we parallelize + /// @param Values A set of LLVM-IR Values that should be available in + /// the new loop body. + /// @param VMap A map to allow outside access to the new versions of + /// the values in @p Values. + /// @param LoopBody A pointer to an iterator that is set to point to the + /// body of the created loop. It should be used to insert + /// instructions that form the actual loop body. + /// @param Predicate The predicate for the loop guard /// - /// @return Value* The newly created induction variable for this loop. - Value *createParallelLoop(Value *LowerBound, Value *UpperBound, Value *Stride, - SetVector &UsedValues, - ValueToValueMapTy &VMap, - BasicBlock::iterator *LoopBody); + /// @return The newly created induction variable for this loop. 
+ Value *createParallelLoop(Value *LB, Value *UB, Value *Stride, + SetVector &Values, ValueToValueMapTy &VMap, + BasicBlock::iterator *LoopBody, + ICmpInst::Predicate Predicate = ICmpInst::ICMP_SLE); private: - PollyIRBuilder &Builder; - Pass *P; - - IntegerType *getIntPtrTy(); + /// @brief Return the current module Module *getModule(); - void createCallParallelLoopStart(Value *SubFunction, Value *SubfunctionParam, - Value *NumberOfThreads, Value *LowerBound, - Value *UpperBound, Value *Stride); - Value *createCallLoopNext(Value *LowerBoundPtr, Value *UpperBoundPtr); - void createCallParallelEnd(); - void createCallLoopEndNowait(); + /// @brief Return the current function + Function *getFunction(); - Value *loadValuesIntoStruct(SetVector &Values); - void extractValuesFromStruct(SetVector OldValues, Value *Struct, - ValueToValueMapTy &Map); + /// @brief Create a runtime library call to spawn the worker threads + /// + /// @param SubFn The subfunction which holds the loop body + /// @param SubFnParam The parameter for the subfunction (basically the struct + /// filled with the outside values) + /// @param LB The lower bound for the loop we parallelize + /// @param UB The upper bound for the loop we parallelize + /// @param Stride The stride of the loop we parallelize + void createCallSpawnThreads(Value *SubFn, Value *SubFnParam, Value *LB, + Value *UB, Value *Stride); + + /// @brief Create a runtime library call to join the worker threads + void createCallJoinThreads(); - /// @brief Create the OpenMP subfunction. + /// @brief Create a runtime library call to get the next work item /// - /// @param Stride The value by which the induction variable is - /// incremented. - /// @param Struct The structure that is used to make Values available to - /// the new loop body. - /// @param UsedValues A set of LLVM-IR Values that should be available to - /// the new loop body. - /// @param VMap This map that is filled by createSubfunction(). 
It - /// maps the values in UsedValues to Values through which - /// their content is available within the loop body. - /// @param SubFunction The newly created SubFunction is returned here. + /// @param LBPtr A pointer value to store the work item begin in + /// @param UBPtr A pointer value to store the work item end in + /// + /// @returns A true value if the work item is not empty + Value *createCallGetWorkItem(Value *LBPtr, Value *UBPtr); + + /// @brief Create a runtime library call to allow cleanup of the thread /// - /// @return Value* The newly created induction variable. - Value *createSubfunction(Value *Stride, Value *Struct, - SetVector UsedValues, - ValueToValueMapTy &VMap, Function **SubFunction); + /// @note This function is called right before the thread will exit the + /// subfunction and only if the runtime system depends on it. + void createCallCleanupThread(); - /// @brief Create the definition of the OpenMP subfunction. - Function *createSubfunctionDefinition(); + /// @brief Create a struct for all @p Values and store them in there + /// + /// @param Values The values which should be stored in the struct + /// + /// @return The created struct + Value *storeValuesIntoStruct(SetVector &Values); + + /// @brief Extract all values from the @p Struct and construct the mapping + /// + /// @param Values The values which were stored in the struct + /// @param Struct The struct holding all the values in @p Values + /// @param VMap A map to associate every element of @p Values with the + /// new llvm value loaded from the @p Struct. 
+ void extractValuesFromStruct(SetVector Values, Value *Struct, + ValueToValueMapTy &VMap); + + /// @brief Create the definition of the parallel subfunction + Function *createSubFnDefinition(); + + /// @brief Create the parallel subfunction + /// + /// @param Stride The induction variable increment + /// @param Struct A struct holding all values in @p Values + /// @param Values A set of LLVM-IR Values that should be available in + /// the new loop body. + /// @param VMap A map to allow outside access to the new versions of + /// the values in @p Values. + /// @param SubFn The newly created subfunction is returned here. + /// + /// @return The newly created induction variable + Value *createSubFn(Value *Stride, Value *Struct, + SetVector UsedValues, ValueToValueMapTy &VMap, + Function **SubFn, ICmpInst::Predicate Predicate); }; } // end namespace polly #endif Index: lib/CodeGen/CodeGeneration.cpp =================================================================== --- lib/CodeGen/CodeGeneration.cpp +++ lib/CodeGen/CodeGeneration.cpp @@ -316,7 +316,7 @@ /// @brief Update ClastVars and ValueMap according to a value map. /// /// @param VMap A map from old to new values. - void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap); + void updateWithValueMap(ParallelLoopGenerator::ValueToValueMapTy &VMap); /// @brief Create an OpenMP parallel for loop. 
/// @@ -583,8 +583,8 @@ return Values; } -void -ClastStmtCodeGen::updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap) { +void ClastStmtCodeGen::updateWithValueMap( + ParallelLoopGenerator::ValueToValueMapTy &VMap) { std::set Inserted; for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E; @@ -593,8 +593,8 @@ Inserted.insert(I->second); } - for (OMPGenerator::ValueToValueMapTy::iterator I = VMap.begin(), - E = VMap.end(); + for (ParallelLoopGenerator::ValueToValueMapTy::iterator I = VMap.begin(), + E = VMap.end(); I != E; ++I) { if (Inserted.count(I->first)) continue; @@ -619,8 +619,8 @@ BasicBlock::iterator LoopBody; IntegerType *IntPtrTy = getIntPtrTy(); SetVector Values; - OMPGenerator::ValueToValueMapTy VMap; - OMPGenerator OMPGen(Builder, P); + ParallelLoopGenerator::ValueToValueMapTy VMap; + ParallelLoopGenerator OMPGen(Builder, P, LI, DT, IntPtrTy); Stride = Builder.getInt(APInt_from_MPZ(For->stride)); Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy); Index: lib/CodeGen/LoopGenerators.cpp =================================================================== --- lib/CodeGen/LoopGenerators.cpp +++ lib/CodeGen/LoopGenerators.cpp @@ -7,8 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file contains functions to create scalar and OpenMP parallel loops -// as LLVM-IR. +// This file contains functions to create scalar and parallel loops as LLVM-IR. 
// //===----------------------------------------------------------------------===// @@ -19,10 +18,16 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; using namespace polly; +static cl::opt + PollyNumThreads("polly-num-threads", + cl::desc("Number of threads to use (0 = auto)"), cl::Hidden, + cl::init(0)); + // We generate a loop of the following structure // // BeforeBB @@ -127,16 +132,55 @@ return IV; } -void OMPGenerator::createCallParallelLoopStart( - Value *SubFunction, Value *SubfunctionParam, Value *NumberOfThreads, - Value *LowerBound, Value *UpperBound, Value *Stride) { - Module *M = getModule(); - const char *Name = "GOMP_parallel_loop_runtime_start"; +Value *ParallelLoopGenerator::createParallelLoop( + Value *LB, Value *UB, Value *Stride, SetVector &UsedValues, + ValueToValueMapTy &Map, BasicBlock::iterator *LoopBody, + ICmpInst::Predicate Predicate) { + Value *Struct, *IV, *SubFnParam; + Function *SubFn; + + assert(Stride->getType() == IVType && "Type missmatch for IV type"); + assert(LB->getType() == IVType && "Type missmatch for IV type"); + assert(UB->getType() == IVType && "Type missmatch for IV type"); + + Struct = storeValuesIntoStruct(UsedValues); + + BasicBlock::iterator BeforeLoop = Builder.GetInsertPoint(); + IV = createSubFn(Stride, Struct, UsedValues, Map, &SubFn, Predicate); + *LoopBody = Builder.GetInsertPoint(); + Builder.SetInsertPoint(BeforeLoop); + + SubFnParam = Builder.CreateBitCast(Struct, Builder.getInt8PtrTy(), + "polly.par.userContext"); + + // Add one as the upper bound provided by openmp is a < comparison + // whereas the codegenForSequential function creates a <= comparison. 
+ if (Predicate == ICmpInst::ICMP_SLE) + UB = Builder.CreateAdd(UB, ConstantInt::get(IVType, 1)); + + // Tell the runtime we start a parallel loop + createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride); + Builder.CreateCall(SubFn, SubFnParam); + createCallJoinThreads(); + + // Mark the end of the lifetime for the parameter struct + Type *Ty = Struct->getType(); + ConstantInt *SizeOf = dyn_cast(ConstantExpr::getSizeOf(Ty)); + Builder.CreateLifetimeEnd(Struct, SizeOf); + + return IV; +} + +void ParallelLoopGenerator::createCallSpawnThreads(Value *SubFn, + Value *SubFnParam, Value *LB, + Value *UB, Value *Stride) { + const std::string Name = "GOMP_parallel_loop_runtime_start"; + Function *F = M->getFunction(Name); // If F is not available, declare it. if (!F) { - Type *LongTy = getIntPtrTy(); + Type *LongTy = IVType; GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; Type *Params[] = {PointerType::getUnqual(FunctionType::get( @@ -148,21 +192,21 @@ F = Function::Create(Ty, Linkage, Name, M); } - Value *Args[] = {SubFunction, SubfunctionParam, NumberOfThreads, - LowerBound, UpperBound, Stride}; + Value *NumberOfThreads = Builder.getInt32(PollyNumThreads); + Value *Args[] = {SubFn, SubFnParam, NumberOfThreads, LB, UB, Stride}; Builder.CreateCall(F, Args); } -Value *OMPGenerator::createCallLoopNext(Value *LowerBoundPtr, - Value *UpperBoundPtr) { - Module *M = getModule(); - const char *Name = "GOMP_loop_runtime_next"; +Value *ParallelLoopGenerator::createCallGetWorkItem(Value *LBPtr, + Value *UBPtr) { + const std::string Name = "GOMP_loop_runtime_next"; + Function *F = M->getFunction(Name); // If F is not available, declare it. 
if (!F) { - Type *LongPtrTy = PointerType::getUnqual(getIntPtrTy()); + Type *LongPtrTy = PointerType::getUnqual(IVType); GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; Type *Params[] = {LongPtrTy, LongPtrTy}; @@ -171,17 +215,16 @@ F = Function::Create(Ty, Linkage, Name, M); } - Value *Args[] = {LowerBoundPtr, UpperBoundPtr}; - + Value *Args[] = {LBPtr, UBPtr}; Value *Return = Builder.CreateCall(F, Args); Return = Builder.CreateICmpNE( Return, Builder.CreateZExt(Builder.getFalse(), Return->getType())); return Return; } -void OMPGenerator::createCallParallelEnd() { - const char *Name = "GOMP_parallel_end"; - Module *M = getModule(); +void ParallelLoopGenerator::createCallJoinThreads() { + const std::string Name = "GOMP_parallel_end"; + Function *F = M->getFunction(Name); // If F is not available, declare it. @@ -195,9 +238,9 @@ Builder.CreateCall(F); } -void OMPGenerator::createCallLoopEndNowait() { - const char *Name = "GOMP_loop_end_nowait"; - Module *M = getModule(); +void ParallelLoopGenerator::createCallCleanupThread() { + const std::string Name = "GOMP_loop_end_nowait"; + Function *F = M->getFunction(Name); // If F is not available, declare it. 
@@ -211,39 +254,49 @@ Builder.CreateCall(F); } -IntegerType *OMPGenerator::getIntPtrTy() { - return P->getAnalysis().getDataLayout().getIntPtrType( - Builder.getContext()); -} - -Module *OMPGenerator::getModule() { +Module *ParallelLoopGenerator::getModule() { return Builder.GetInsertBlock()->getParent()->getParent(); } -Function *OMPGenerator::createSubfunctionDefinition() { - Module *M = getModule(); +Function *ParallelLoopGenerator::createSubFnDefinition() { Function *F = Builder.GetInsertBlock()->getParent(); std::vector Arguments(1, Builder.getInt8PtrTy()); FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false); - Function *FN = Function::Create(FT, Function::InternalLinkage, - F->getName() + ".omp_subfn", M); + Function *SubFn = Function::Create(FT, Function::InternalLinkage, + F->getName() + ".polly.subfn", M); + // Do not run any polly pass on the new function. - FN->addFnAttr(PollySkipFnAttr); + SubFn->addFnAttr(PollySkipFnAttr); - Function::arg_iterator AI = FN->arg_begin(); - AI->setName("omp.userContext"); + Function::arg_iterator AI = SubFn->arg_begin(); + AI->setName("polly.par.userContext"); - return FN; + return SubFn; } -Value *OMPGenerator::loadValuesIntoStruct(SetVector &Values) { - std::vector Members; +Value * +ParallelLoopGenerator::storeValuesIntoStruct(SetVector &Values) { + SmallVector Members; for (Value *V : Values) Members.push_back(V->getType()); + // We do not want to allocate the alloca inside any loop, thus we allocate it + // in the entry block of the function and use annotations to denote the actual + // live span (similar to clang). + // + // XXX: It might be necessary to allocate the struct explicitly in front of + // the outermost loop surrounding the current insertion block in order + // to prevent us from allocating stack space without actually using it. + // For now we assume the annotations suffice to sink the allocas to the + // most recent dominating basic block which is not part of a loop. 
+ BasicBlock &EntryBB = Builder.GetInsertBlock()->getParent()->getEntryBlock(); + Instruction *IP = EntryBB.getFirstInsertionPt(); StructType *Ty = StructType::get(Builder.getContext(), Members); - Value *Struct = Builder.CreateAlloca(Ty, 0, "omp.userContext"); + Value *Struct = new AllocaInst(Ty, 0, "polly.par.userContext", IP); + + ConstantInt *SizeOf = dyn_cast(ConstantExpr::getSizeOf(Ty)); + Builder.CreateLifetimeStart(Struct, SizeOf); for (unsigned i = 0; i < Values.size(); i++) { Value *Address = Builder.CreateStructGEP(Struct, i); @@ -253,121 +306,82 @@ return Struct; } -void OMPGenerator::extractValuesFromStruct(SetVector OldValues, - Value *Struct, - ValueToValueMapTy &Map) { +void ParallelLoopGenerator::extractValuesFromStruct( + SetVector OldValues, Value *Struct, ValueToValueMapTy &Map) { for (unsigned i = 0; i < OldValues.size(); i++) { Value *Address = Builder.CreateStructGEP(Struct, i); Value *NewValue = Builder.CreateLoad(Address); - Map.insert(std::make_pair(OldValues[i], NewValue)); + Map[OldValues[i]] = NewValue; } } -Value *OMPGenerator::createSubfunction(Value *Stride, Value *StructData, - SetVector Data, - ValueToValueMapTy &Map, - Function **SubFunction) { - Function *FN = createSubfunctionDefinition(); - - BasicBlock *PrevBB, *HeaderBB, *ExitBB, *CheckNextBB, *LoadIVBoundsBB, - *AfterBB; - Value *LowerBoundPtr, *UpperBoundPtr, *UserContext, *Ret1, *HasNextSchedule, - *LowerBound, *UpperBound, *IV; - Type *IntPtrTy = getIntPtrTy(); - LLVMContext &Context = FN->getContext(); +Value *ParallelLoopGenerator::createSubFn(Value *Stride, Value *StructData, + SetVector Data, + ValueToValueMapTy &Map, + Function **SubFnPtr, + ICmpInst::Predicate Predicate) { + BasicBlock *PrevBB, *HeaderBB, *ExitBB, *CheckNextBB, *PreHeaderBB, *AfterBB; + Value *LBPtr, *UBPtr, *UserContext, *Ret1, *HasNextSchedule, *LB, *UB, *IV; + Type *IntPtrTy = IVType; + Function *SubFn = createSubFnDefinition(); + LLVMContext &Context = SubFn->getContext(); // Store the previous 
basic block. PrevBB = Builder.GetInsertBlock(); // Create basic blocks. - HeaderBB = BasicBlock::Create(Context, "omp.setup", FN); - ExitBB = BasicBlock::Create(Context, "omp.exit", FN); - CheckNextBB = BasicBlock::Create(Context, "omp.checkNext", FN); - LoadIVBoundsBB = BasicBlock::Create(Context, "omp.loadIVBounds", FN); + HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn); + ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn); + CheckNextBB = BasicBlock::Create(Context, "polly.par.checkNext", SubFn); + PreHeaderBB = BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn); - DominatorTree &DT = P->getAnalysis().getDomTree(); DT.addNewBlock(HeaderBB, PrevBB); DT.addNewBlock(ExitBB, HeaderBB); DT.addNewBlock(CheckNextBB, HeaderBB); - DT.addNewBlock(LoadIVBoundsBB, HeaderBB); + DT.addNewBlock(PreHeaderBB, HeaderBB); // Fill up basic block HeaderBB. Builder.SetInsertPoint(HeaderBB); - LowerBoundPtr = Builder.CreateAlloca(IntPtrTy, 0, "omp.lowerBoundPtr"); - UpperBoundPtr = Builder.CreateAlloca(IntPtrTy, 0, "omp.upperBoundPtr"); - UserContext = Builder.CreateBitCast(FN->arg_begin(), StructData->getType(), - "omp.userContext"); + LBPtr = Builder.CreateAlloca(IntPtrTy, 0, "polly.par.LBPtr"); + UBPtr = Builder.CreateAlloca(IntPtrTy, 0, "polly.par.UBPtr"); + UserContext = Builder.CreateBitCast(SubFn->arg_begin(), StructData->getType(), + "polly.par.userContext"); extractValuesFromStruct(Data, UserContext, Map); Builder.CreateBr(CheckNextBB); // Add code to check if another set of iterations will be executed. 
Builder.SetInsertPoint(CheckNextBB); - Ret1 = createCallLoopNext(LowerBoundPtr, UpperBoundPtr); + Ret1 = createCallGetWorkItem(LBPtr, UBPtr); HasNextSchedule = Builder.CreateTrunc(Ret1, Builder.getInt1Ty(), - "omp.hasNextScheduleBlock"); - Builder.CreateCondBr(HasNextSchedule, LoadIVBoundsBB, ExitBB); + "polly.par.hasNextScheduleBlock"); + Builder.CreateCondBr(HasNextSchedule, PreHeaderBB, ExitBB); // Add code to to load the iv bounds for this set of iterations. - Builder.SetInsertPoint(LoadIVBoundsBB); - LowerBound = Builder.CreateLoad(LowerBoundPtr, "omp.lowerBound"); - UpperBound = Builder.CreateLoad(UpperBoundPtr, "omp.upperBound"); + Builder.SetInsertPoint(PreHeaderBB); + LB = Builder.CreateLoad(LBPtr, "polly.par.LB"); + UB = Builder.CreateLoad(UBPtr, "polly.par.UB"); // Subtract one as the upper bound provided by openmp is a < comparison // whereas the codegenForSequential function creates a <= comparison. - UpperBound = Builder.CreateSub(UpperBound, ConstantInt::get(IntPtrTy, 1), - "omp.upperBoundAdjusted"); + if (Predicate == ICmpInst::ICMP_SLE) + UB = Builder.CreateSub(UB, ConstantInt::get(IntPtrTy, 1), + "polly.par.UBAdjusted"); Builder.CreateBr(CheckNextBB); Builder.SetInsertPoint(--Builder.GetInsertPoint()); - LoopInfo &LI = P->getAnalysis(); - IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, LI, DT, AfterBB, + IV = createLoop(LB, UB, Stride, Builder, P, LI, DT, AfterBB, ICmpInst::ICMP_SLE); BasicBlock::iterator LoopBody = Builder.GetInsertPoint(); - Builder.SetInsertPoint(AfterBB->begin()); - // Add code to terminate this openmp subfunction. + // Add code to terminate this subfunction. 
Builder.SetInsertPoint(ExitBB); - createCallLoopEndNowait(); + createCallCleanupThread(); Builder.CreateRetVoid(); Builder.SetInsertPoint(LoopBody); - *SubFunction = FN; - - return IV; -} - -Value *OMPGenerator::createParallelLoop(Value *LowerBound, Value *UpperBound, - Value *Stride, - SetVector &Values, - ValueToValueMapTy &Map, - BasicBlock::iterator *LoopBody) { - Value *Struct, *IV, *SubfunctionParam, *NumberOfThreads; - Function *SubFunction; - - Struct = loadValuesIntoStruct(Values); - - BasicBlock::iterator PrevInsertPoint = Builder.GetInsertPoint(); - IV = createSubfunction(Stride, Struct, Values, Map, &SubFunction); - *LoopBody = Builder.GetInsertPoint(); - Builder.SetInsertPoint(PrevInsertPoint); - - // Create call for GOMP_parallel_loop_runtime_start. - SubfunctionParam = - Builder.CreateBitCast(Struct, Builder.getInt8PtrTy(), "omp_data"); - - NumberOfThreads = Builder.getInt32(0); - - // Add one as the upper bound provided by openmp is a < comparison - // whereas the codegenForSequential function creates a <= comparison. 
- UpperBound = - Builder.CreateAdd(UpperBound, ConstantInt::get(getIntPtrTy(), 1)); - - createCallParallelLoopStart(SubFunction, SubfunctionParam, NumberOfThreads, - LowerBound, UpperBound, Stride); - Builder.CreateCall(SubFunction, SubfunctionParam); - createCallParallelEnd(); + *SubFnPtr = SubFn; return IV; } Index: test/Cloog/CodeGen/OpenMP/20120330-argument-use.ll =================================================================== --- test/Cloog/CodeGen/OpenMP/20120330-argument-use.ll +++ test/Cloog/CodeGen/OpenMP/20120330-argument-use.ll @@ -1,32 +1,8 @@ ; RUN: opt %loadPolly -basicaa -polly-codegen -enable-polly-openmp < %s -S | FileCheck %s - -;/* -; * ============================================================================= -; * -; * Filename: 20120330-argument-use.c -; * -; * Description: Polly OpenMP test case -; * -; * Test if the OpenMP subfunction uses the argument copy in -; * the OpenMP struct not the original one only available in -; * the original function. -; * -; * Run with -polly-codegen -enable-polly-openmp -; * -; * Author: Johannes Doerfert johannes@jdoerfert.de -; * -; * Created: 2012-03-30 -; * Modified: 2012-03-30 -; * -; * ============================================================================= -; */ ; ;void f(int * restrict A, int * restrict B, int n) { -; int i; -; -; for (i = 0; i < n; i++) { +; for (int i = 0; i < n; i++) ; A[i] = B[i] * 2; -; } ;} target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -58,11 +34,11 @@ ret void } -; CHECK: %omp.userContext1 = bitcast i8* %omp.userContext to { i32, i32*, i32* }* -; CHECK: %0 = getelementptr inbounds { i32, i32*, i32* }* %omp.userContext1, i32 0, i32 0 +; CHECK: %[[PN:[._a-zA-Z0-9]*]]userContext[[NO:[0-9]*]] = bitcast i8* %{{.*}}userContext to { i32, i32*, i32* }* +; CHECK: %0 = getelementptr inbounds { i32, i32*, i32* }* %[[PN]]userContext[[NO]], i32 0, i32 0 ; 
CHECK: %1 = load i32* %0 -; CHECK: %2 = getelementptr inbounds { i32, i32*, i32* }* %omp.userContext1, i32 0, i32 1 +; CHECK: %2 = getelementptr inbounds { i32, i32*, i32* }* %[[PN]]userContext[[NO]], i32 0, i32 1 ; CHECK: %3 = load i32** %2 -; CHECK: %4 = getelementptr inbounds { i32, i32*, i32* }* %omp.userContext1, i32 0, i32 2 +; CHECK: %4 = getelementptr inbounds { i32, i32*, i32* }* %[[PN]]userContext[[NO]], i32 0, i32 2 ; CHECK: %5 = load i32** %4 Index: test/Cloog/CodeGen/OpenMP/clastvar_after_parallel_loop.ll =================================================================== --- test/Cloog/CodeGen/OpenMP/clastvar_after_parallel_loop.ll +++ test/Cloog/CodeGen/OpenMP/clastvar_after_parallel_loop.ll @@ -53,4 +53,4 @@ ; CLOOG: Stmt_for_end(c2); ; CLOOG: } -; CHECK: @f.omp_subfn +; CHECK: @f.{{.*}}subfn Index: test/Cloog/CodeGen/OpenMP/copy_in_argument.ll =================================================================== --- test/Cloog/CodeGen/OpenMP/copy_in_argument.ll +++ test/Cloog/CodeGen/OpenMP/copy_in_argument.ll @@ -31,4 +31,4 @@ ret void } -; CHECK: %omp.userContext = alloca { float } +; CHECK: %{{.*}}userContext = alloca { float } Index: test/Cloog/CodeGen/OpenMP/copy_in_temporary.ll =================================================================== --- test/Cloog/CodeGen/OpenMP/copy_in_temporary.ll +++ test/Cloog/CodeGen/OpenMP/copy_in_temporary.ll @@ -32,4 +32,4 @@ ret void } -; CHECK: %omp.userContext = alloca { float } +; CHECK: %{{.*}}userContext = alloca { float } Index: test/Cloog/CodeGen/OpenMP/extract_memref.ll =================================================================== --- test/Cloog/CodeGen/OpenMP/extract_memref.ll +++ test/Cloog/CodeGen/OpenMP/extract_memref.ll @@ -49,9 +49,8 @@ call void @foo() ret i32 0 } -; CHECK: getelementptr inbounds { [10 x float]* }* %omp.userContext, i32 0, i32 0 -; CHECK: store [10 x float]* %A, [10 x float]** %0 -; CHECK: %omp_data = bitcast { [10 x float]* }* %omp.userContext to i8* -; CHECK: inbounds 
{ [10 x float]* }* %omp.userContext1, i32 0, i32 0 +; CHECK: %[[V:[._a-zA-Z0-9]+]] = getelementptr inbounds { [10 x float]* }* %[[PN:[._a-zA-Z0-9]*]]userContext, i32 0, i32 0 +; CHECK: store [10 x float]* %A, [10 x float]** %[[V]] +; CHECK: inbounds { [10 x float]* }* %[[PN]]userContext{{[0-9]*}}, i32 0, i32 0 ; CHECK: load [10 x float]** Index: test/Cloog/CodeGen/OpenMP/param_referenced_in_stmt.ll =================================================================== --- test/Cloog/CodeGen/OpenMP/param_referenced_in_stmt.ll +++ test/Cloog/CodeGen/OpenMP/param_referenced_in_stmt.ll @@ -1,8 +1,5 @@ ; RUN: opt %loadPolly -polly-codegen < %s -enable-polly-openmp -S | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - - +; ; This test case implements the following code: ; ; for (i = 0; i < 1024; i++) @@ -10,6 +7,7 @@ ; ; The problem is that 'param' is not references in any subscript of loop ; bound, but it must still be forwarded to the OpenMP subfunction. 
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" define void @foo(double %param, [1024 x double]* %A) { entry: @@ -35,4 +33,4 @@ ret void } -; CHECK: omp_subfn +; CHECK: @foo{{.*}}subfn Index: test/Cloog/CodeGen/OpenMP/simple_nested_loop.ll =================================================================== --- test/Cloog/CodeGen/OpenMP/simple_nested_loop.ll +++ test/Cloog/CodeGen/OpenMP/simple_nested_loop.ll @@ -78,15 +78,15 @@ declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind -; CHECK: %omp.userContext = alloca { i32 } -; CHECK: getelementptr inbounds { i32 }* %omp.userContext, i32 0, i32 0 -; CHECK: store i32 %polly.indvar, i32* %0 -; CHECK: %omp_data = bitcast { i32 }* %omp.userContext to i8* -; CHECK: call void @GOMP_parallel_loop_runtime_start(void (i8*)* @loop_openmp.omp_subfn, i8* %omp_data, i32 0, i32 0, i32 10, i32 1) -; CHECK: call void @loop_openmp.omp_subfn(i8* %omp_data) +; CHECK: %[[PN:[._a-zA-Z0-9]*]]userContext = alloca { i32 } +; CHECK: %[[NO:[._a-zA-Z0-9]*]] = getelementptr inbounds { i32 }* %[[PN]]userContext, i32 0, i32 0 +; CHECK: store i32 %polly.indvar, i32* %[[NO]] +; CHECK: %[[DATA:[._a-zA-Z0-9]*]] = bitcast { i32 }* %[[PN]]userContext to i8* +; CHECK: call void @GOMP_parallel_loop_runtime_start(void (i8*)* @loop_openmp{{.*}}subfn, i8* %[[DATA]], i32 0, i32 0, i32 10, i32 1) +; CHECK: call void @loop_openmp{{.*}}subfn(i8* %[[DATA]]) ; CHECK: call void @GOMP_parallel_end() ; Verify the new subfunction is annotated such that SCoP detection will skip it. 
-; CHECK: @loop_openmp.omp_subfn({{.*}}) [[ATTR:#[0-9]+]] +; CHECK: @loop_openmp{{.*}}subfn({{.*}}) [[ATTR:#[0-9]+]] ; CHECK: attributes [[ATTR]] = {{{[^\}]*}}polly.skip.fn{{[^\}]*}}} Index: test/Cloog/CodeGen/OpenMP/structnames.ll =================================================================== --- test/Cloog/CodeGen/OpenMP/structnames.ll +++ test/Cloog/CodeGen/OpenMP/structnames.ll @@ -100,6 +100,6 @@ declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind -; CHECK: %omp.userContext = alloca {} -; CHECK: %omp.userContext1 = alloca { i32 } +; CHECK-DAG: %{{[._a-zA-Z0-9]*}}userContext{{[0-9]*}} = alloca {} +; CHECK-DAG: %{{[._a-zA-Z0-9]*}}userContext{{[0-9]*}} = alloca { i32 }