Index: include/polly/CodeGen/IslExprBuilder.h =================================================================== --- include/polly/CodeGen/IslExprBuilder.h +++ include/polly/CodeGen/IslExprBuilder.h @@ -14,9 +14,9 @@ #include "polly/CodeGen/IRBuilder.h" -#include "isl/ast.h" +#include "llvm/ADT/MapVector.h" -#include +#include "isl/ast.h" namespace polly { @@ -77,7 +77,7 @@ class IslExprBuilder { public: /// @brief A map from isl_ids to llvm::Values. - typedef std::map IDToValueTy; + typedef llvm::MapVector IDToValueTy; /// @brief Construct an IslExprBuilder. /// @@ -118,7 +118,7 @@ private: PollyIRBuilder &Builder; - std::map &IDToValue; + IDToValueTy &IDToValue; llvm::Value *createOp(__isl_take isl_ast_expr *Expr); llvm::Value *createOpUnary(__isl_take isl_ast_expr *Expr); Index: lib/CodeGen/IslCodeGeneration.cpp =================================================================== --- lib/CodeGen/IslCodeGeneration.cpp +++ lib/CodeGen/IslCodeGeneration.cpp @@ -32,6 +32,7 @@ #include "polly/Support/GICHelper.h" #include "polly/Support/ScopHelper.h" #include "polly/TempScopInfo.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" @@ -81,6 +82,12 @@ // ivs. IslExprBuilder::IDToValueTy IDToValue; + // A set of Value -> Value remappings to apply. + // + // When generating code for new statements this map is used to remap certain + // llvm::Values. + ValueMapT ValueMap; + // Extract the upper bound of this loop // // The isl code generation can generate arbitrary expressions to check if the @@ -104,10 +111,13 @@ CmpInst::Predicate &Predicate); unsigned getNumberOfIterations(__isl_keep isl_ast_node *For); + SetVector getOMPValues(__isl_keep isl_ast_node *For); + void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap); void createFor(__isl_take isl_ast_node *For); void createForVector(__isl_take isl_ast_node *For, int VectorWidth); void createForSequential(__isl_take isl_ast_node *For); + void createForOpenMP(__isl_take isl_ast_node *For); /// Generate LLVM-IR that computes the values of the original induction /// variables in function of the newly generated loop induction variables. @@ -224,6 +234,57 @@ return NumberOfIterations + 1; } +SetVector IslNodeBuilder::getOMPValues(__isl_keep isl_ast_node *For) { + SetVector Values; + + for (const auto &I : IDToValue) + Values.insert(I.second); + + isl_union_set *Schedule = isl_union_map_domain(IslAstInfo::getSchedule(For)); + + auto Func = [](isl_set *Set, void *User) { + isl_id *Id = isl_set_get_tuple_id(Set); + SetVector &Values = *static_cast *>(User); + const ScopStmt *Stmt = static_cast(isl_id_get_user(Id)); + const BasicBlock *BB = Stmt->getBasicBlock(); + + // Check all the operands of instructions in the basic block. + for (const Instruction &Inst : *BB) { + for (Value *SrcVal : Inst.operands()) { + if (Instruction *OpInst = dyn_cast(SrcVal)) + if (Stmt->getParent()->getRegion().contains(OpInst)) + continue; + + if (isa(SrcVal) || isa(SrcVal)) + Values.insert(SrcVal); + } + } + isl_id_free(Id); + isl_set_free(Set); + return 0; + }; + isl_union_set_foreach_set(Schedule, Func, &Values); + isl_union_set_free(Schedule); + + return Values; +} + +void IslNodeBuilder::updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap) { + std::set Inserted; + + for (const auto &I : IDToValue) { + IDToValue[I.first] = VMap[I.second]; + Inserted.insert(I.second); + } + + for (const auto &I : VMap) { + if (Inserted.count(I.first)) + continue; + + ValueMap[I.first] = I.second; + } +} + void IslNodeBuilder::createUserVector(__isl_take isl_ast_node *User, std::vector &IVS, __isl_take isl_id *IteratorID, @@ -300,7 +361,7 @@ llvm_unreachable("Unhandled isl_ast_node in vectorizer"); } - IDToValue.erase(IteratorID); + IDToValue.erase(IDToValue.find(IteratorID)); isl_id_free(IteratorID); isl_union_map_free(Schedule); @@ -364,7 +425,7 @@ Annotator.End(); - IDToValue.erase(IteratorID); + IDToValue.erase(IDToValue.find(IteratorID)); Builder.SetInsertPoint(ExitBlock->begin()); @@ -373,6 +434,82 @@ isl_id_free(IteratorID); } +static void clearDomtree(Function *F, DominatorTree &DT) { + DomTreeNode *N = DT.getNode(&F->getEntryBlock()); + std::vector Nodes; + for (po_iterator I = po_begin(N), E = po_end(N); I != E; ++I) + Nodes.push_back(I->getBlock()); + + for (BasicBlock *BB : Nodes) + DT.eraseNode(BB); +} + +void IslNodeBuilder::createForOpenMP(__isl_take isl_ast_node *For) { + isl_ast_node *Body; + isl_ast_expr *Init, *Inc, *Iterator, *UB; + isl_id *IteratorID; + Value *ValueLB, *ValueUB, *ValueInc; + Type *MaxType; + Value *IV; + CmpInst::Predicate Predicate; + + Body = isl_ast_node_for_get_body(For); + + Init = isl_ast_node_for_get_init(For); + Inc = isl_ast_node_for_get_inc(For); + Iterator = isl_ast_node_for_get_iterator(For); + IteratorID = isl_ast_expr_get_id(Iterator); + UB = getUpperBound(For, Predicate); + + ValueLB = ExprBuilder.create(Init); + ValueUB = ExprBuilder.create(UB); + ValueInc = ExprBuilder.create(Inc); + + MaxType = ExprBuilder.getType(Iterator); + MaxType = ExprBuilder.getWidestType(MaxType, ValueLB->getType()); + MaxType = ExprBuilder.getWidestType(MaxType, ValueUB->getType()); + MaxType = ExprBuilder.getWidestType(MaxType, ValueInc->getType()); + + if (MaxType != ValueLB->getType()) + ValueLB = Builder.CreateSExt(ValueLB, MaxType); + if (MaxType != ValueUB->getType()) + ValueUB = Builder.CreateSExt(ValueUB, MaxType); + if (MaxType != ValueInc->getType()) + ValueInc = Builder.CreateSExt(ValueInc, MaxType); + + BasicBlock::iterator LoopBody; + SetVector Values = getOMPValues(For); + OMPGenerator::ValueToValueMapTy VMap; + OMPGenerator OMPGen(Builder, P); + + IV = OMPGen.createParallelLoop(ValueLB, ValueUB, ValueInc, Values, VMap, + &LoopBody); + BasicBlock::iterator AfterLoop = Builder.GetInsertPoint(); + Builder.SetInsertPoint(LoopBody); + + // Save the current values. + ValueMapT ValueMapCopy = ValueMap; + IslExprBuilder::IDToValueTy IDToValueCopy = IDToValue; + + updateWithValueMap(VMap); + IDToValue[IteratorID] = IV; + + create(Body); + + // Restore the original values. + ValueMap = ValueMapCopy; + IDToValue = IDToValueCopy; + + Builder.SetInsertPoint(AfterLoop); + + clearDomtree((*LoopBody).getParent()->getParent(), + P->getAnalysis().getDomTree()); + + isl_ast_node_free(For); + isl_ast_expr_free(Iterator); + isl_id_free(IteratorID); +} + void IslNodeBuilder::createFor(__isl_take isl_ast_node *For) { bool Vector = PollyVectorizerChoice != VECTORIZER_NONE; @@ -384,6 +521,12 @@ return; } } + + if (IslAstInfo::isOutermostParallel(For) && + !IslAstInfo::isReductionParallel(For)) { + createForOpenMP(For); + return; + } createForSequential(For); } @@ -458,6 +601,7 @@ VMap[OldIV] = V; } } + VMap.insert(ValueMap.begin(), ValueMap.end()); isl_ast_expr_free(Expr); } @@ -623,6 +767,7 @@ virtual void printScop(raw_ostream &OS) const {} virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); Index: test/Isl/CodeGen/OpenMP/loop-body-references-outer-values.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/OpenMP/loop-body-references-outer-values.ll @@ -0,0 +1,46 @@ +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-ast -analyze < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-codegen-isl -S < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-codegen-isl -S -polly-codegen-scev < %s | FileCheck %s -check-prefix=IR +; +; void loop_references_outer_ids(float *A) { +; for (long i = 0; i < 100; i++) +; A[i] = i; +; } +; +; Make sure we correctly forward the reference to 'A' to the OpenMP subfunction +; + +; AST: #pragma simd +; AST: #pragma omp parallel for +; AST: for (int c1 = 0; c1 <= 99; c1 += 1) +; AST: Stmt_for_body(c1); + +; IR: %omp.userContext = alloca { float* } +; IR-NEXT: %[[gep:[._a-zA-Z0-9]*]] = getelementptr inbounds { float* }* %omp.userContext, i32 0, i32 0 +; IR-NEXT: store float* %A, float** %[[gep]] +; IR-NEXT: %omp_data = bitcast { float* }* %omp.userContext to i8* + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @loop_references_outer_ids(float* %A) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ] + %exitcond = icmp ne i64 %i.0, 100 + br i1 %exitcond, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %conv = sitofp i64 %i.0 to float + %arrayidx = getelementptr inbounds float* %A, i64 %i.0 + store float %conv, float* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i64 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} Index: test/Isl/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll @@ -0,0 +1,104 @@ +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-ast -analyze < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-codegen-isl -S < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-codegen-isl -S -polly-codegen-scev < %s | FileCheck %s -check-prefix=IR +; +; float A[100]; +; +; void loop_references_outer_ids(long n) { +; for (long i = 0; i < 100; i++) +; for (long j = 0; j < 100; j++) +; for (long k = 0; k < n + i; k++) +; A[j] += i + j + k; +; } + +; In this test case we verify that the j-loop is generated as OpenMP parallel +; loop and that the values of 'i' and 'n', needed in the loop bounds of the +; k-loop, are correctly passed to the subfunction. + +; AST: #pragma minimal dependence distance: 1 +; AST: for (int c1 = max(0, -n + 1); c1 <= 99; c1 += 1) +; AST: #pragma omp parallel for +; AST: for (int c3 = 0; c3 <= 99; c3 += 1) +; AST: #pragma minimal dependence distance: 1 +; AST: for (int c5 = 0; c5 < n + c1; c5 += 1) +; AST: Stmt_for_body6(c1, c3, c5); + +; IR: %omp.userContext = alloca { [100 x float]*, i64, i64 } +; IR-NEXT: %4 = getelementptr inbounds { [100 x float]*, i64, i64 }* %omp.userContext, i32 0, i32 0 +; IR-NEXT: store [100 x float]* @A, [100 x float]** %4 +; IR-NEXT: %5 = getelementptr inbounds { [100 x float]*, i64, i64 }* %omp.userContext, i32 0, i32 1 +; IR-NEXT: store i64 %n, i64* %5 +; IR-NEXT: %6 = getelementptr inbounds { [100 x float]*, i64, i64 }* %omp.userContext, i32 0, i32 2 +; IR-NEXT: store i64 %polly.indvar, i64* %6 +; IR-NEXT: %omp_data = bitcast { [100 x float]*, i64, i64 }* %omp.userContext to i8* + +; IR-LABEL: @loop_references_outer_ids.omp_subfn(i8* %omp.userContext) +; IR: %omp.userContext1 = bitcast i8* %omp.userContext to { [100 x float]*, i64, i64 }* +; IR-NEXT: %0 = getelementptr inbounds { [100 x float]*, i64, i64 }* %omp.userContext1, i32 0, i32 0 +; IR-NEXT: %1 = load [100 x float]** %0 +; IR-NEXT: %2 = getelementptr inbounds { [100 x float]*, i64, i64 }* %omp.userContext1, i32 0, i32 1 +; IR-NEXT: %3 = load i64* %2 +; IR-NEXT: %4 = getelementptr inbounds { [100 x float]*, i64, i64 }* %omp.userContext1, i32 0, i32 2 +; IR-NEXT: %5 = load i64* %4 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +@A = common global [100 x float] zeroinitializer, align 16 + +define void @loop_references_outer_ids(i64 %n) { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc13, %entry + %i.0 = phi i64 [ 0, %entry ], [ %inc14, %for.inc13 ] + %exitcond1 = icmp ne i64 %i.0, 100 + br i1 %exitcond1, label %for.body, label %for.end15 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc10, %for.body + %j.0 = phi i64 [ 0, %for.body ], [ %inc11, %for.inc10 ] + %exitcond = icmp ne i64 %j.0, 100 + br i1 %exitcond, label %for.body3, label %for.end12 + +for.body3: ; preds = %for.cond1 + br label %for.cond4 + +for.cond4: ; preds = %for.inc, %for.body3 + %k.0 = phi i64 [ 0, %for.body3 ], [ %inc, %for.inc ] + %add = add nsw i64 %i.0, %n + %cmp5 = icmp slt i64 %k.0, %add + br i1 %cmp5, label %for.body6, label %for.end + +for.body6: ; preds = %for.cond4 + %add7 = add nsw i64 %i.0, %j.0 + %add8 = add nsw i64 %add7, %k.0 + %conv = sitofp i64 %add8 to float + %arrayidx = getelementptr inbounds [100 x float]* @A, i64 0, i64 %j.0 + %tmp = load float* %arrayidx, align 4 + %add9 = fadd float %tmp, %conv + store float %add9, float* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body6 + %inc = add nsw i64 %k.0, 1 + br label %for.cond4 + +for.end: ; preds = %for.cond4 + br label %for.inc10 + +for.inc10: ; preds = %for.end + %inc11 = add nsw i64 %j.0, 1 + br label %for.cond1 + +for.end12: ; preds = %for.cond1 + br label %for.inc13 + +for.inc13: ; preds = %for.end12 + %inc14 = add nsw i64 %i.0, 1 + br label %for.cond + +for.end15: ; preds = %for.cond + ret void +} Index: test/Isl/CodeGen/OpenMP/single_loop.ll =================================================================== --- /dev/null +++ test/Isl/CodeGen/OpenMP/single_loop.ll @@ -0,0 +1,110 @@ +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-ast -analyze < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-codegen-isl -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-codegen-isl -S -polly-codegen-scev -verify-dom-info < %s | FileCheck %s -check-prefix=IR + + +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-import-jscop -polly-import-jscop-dir=%S -polly-ast -analyze < %s | FileCheck %s -check-prefix=AST-STRIDE4 +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-import-jscop -polly-import-jscop-dir=%S -polly-codegen-isl -S < %s | FileCheck %s -check-prefix=IR-STRIDE4 +; RUN: opt %loadPolly -polly-ast-detect-parallel -polly-import-jscop -polly-import-jscop-dir=%S -polly-codegen-isl -polly-codegen-scev -S < %s | FileCheck %s -check-prefix=IR-STRIDE4 +; #define N 1024 +; float A[N]; +; +; void single_parallel_loop(void) { +; for (long i = 0; i < N; i++) +; A[i] = 1; +; } + +; AST: #pragma simd +; AST: #pragma omp parallel for +; AST: for (int c1 = 0; c1 <= 1023; c1 += 1) +; AST: Stmt_S(c1); + +; AST-STRIDE4: #pragma omp parallel for +; AST-STRIDE4: for (int c1 = 0; c1 <= 1023; c1 += 4) +; AST-STRIDE4: #pragma simd +; AST-STRIDE4: for (int c2 = c1; c2 <= c1 + 3; c2 += 1) +; AST-STRIDE4: Stmt_S(c2); + +; IR-LABEL: single_parallel_loop() + +; IR-LABEL: polly.start: +; IR-NEXT: %omp.userContext = alloca +; IR: %omp_data = bitcast +; IR-NEXT: call void @GOMP_parallel_loop_runtime_start(void (i8*)* @single_parallel_loop.omp_subfn, i8* %omp_data, i32 0, i64 0, i64 1024, i64 1) +; IR-NEXT: call void @single_parallel_loop.omp_subfn(i8* %omp_data) +; IR-NEXT: call void @GOMP_parallel_end() +; IR-NEXT: br label %polly.merge_new_and_old + +; IR: define internal void @single_parallel_loop.omp_subfn(i8* %omp.userContext) #1 +; IR-LABEL: omp.setup: +; IR-NEXT: %omp.lowerBoundPtr = alloca i64 +; IR-NEXT: %omp.upperBoundPtr = alloca i64 +; IR-NEXT: %omp.userContext1 = +; IR: br label %omp.checkNext + +; IR-LABEL: omp.exit: +; IR-NEXT: call void @GOMP_loop_end_nowait() +; IR-NEXT: ret void + +; IR-LABEL: omp.checkNext: +; IR-NEXT: %[[gompnext:[._a-zA-Z0-9]*]] = call i8 @GOMP_loop_runtime_next(i64* %omp.lowerBoundPtr, i64* %omp.upperBoundPtr) +; IR-NEXT: %[[cmp:[._a-zA-Z0-9]*]] = icmp ne i8 %[[gompnext]], 0 +; IR-NEXT: br i1 %[[cmp]], label %omp.loadIVBounds, label %omp.exit + +; IR-LABEL: omp.loadIVBounds: +; IR-NEXT: %omp.lowerBound = load i64* %omp.lowerBoundPtr +; IR-NEXT: %omp.upperBound = load i64* %omp.upperBoundPtr +; IR-NEXT: %omp.upperBoundAdjusted = sub i64 %omp.upperBound, 1 +; IR-NEXT: br label %polly.loop_preheader + +; IR-LABEL: polly.loop_exit: +; IR-NEXT: br label %omp.checkNext + +; IR-LABEL: polly.loop_header: +; IR-NEXT: %polly.indvar = phi i64 [ %omp.lowerBound, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.S ] +; IR-NEXT: br label %polly.stmt.S + +; IR-LABEL: polly.stmt.S: +; IR-NEXT: %[[gep:[._a-zA-Z0-9]*]] = getelementptr [1024 x float]* {{.*}}, i64 0, i64 %polly.indvar +; IR-NEXT: store float 1.000000e+00, float* %[[gep]] +; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 +; IR-NEXT: %polly.adjust_ub = sub i64 %omp.upperBoundAdjusted, 1 +; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, %polly.adjust_ub +; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit + +; IR-LABEL: polly.loop_preheader: +; IR-NEXT: br label %polly.loop_header + +; IR: attributes #1 = { "polly.skip.fn" } + +; IR-STRIDE4: call void @GOMP_parallel_loop_runtime_start(void (i8*)* @single_parallel_loop.omp_subfn, i8* %omp_data, i32 0, i64 0, i64 1024, i64 4) +; IR-STRIDE4: add nsw i64 %polly.indvar, 3 +; IR-STRIDE4: %polly.indvar_next = add nsw i64 %polly.indvar, 4 +; IR-STRIDE4 %polly.adjust_ub = sub i64 %omp.upperBoundAdjusted, 4 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [1024 x float] zeroinitializer, align 16 + +define void @single_parallel_loop() nounwind { +entry: + br label %for.i + +for.i: + %indvar = phi i64 [ %indvar.next, %for.inc], [ 0, %entry ] + %scevgep = getelementptr [1024 x float]* @A, i64 0, i64 %indvar + %exitcond = icmp ne i64 %indvar, 1024 + br i1 %exitcond, label %S, label %exit + +S: + store float 1.0, float* %scevgep + br label %for.inc + +for.inc: + %indvar.next = add i64 %indvar, 1 + br label %for.i + +exit: + ret void +} Index: test/Isl/CodeGen/OpenMP/single_parallel_loop___%for.i---%exit.jscop =================================================================== --- /dev/null +++ test/Isl/CodeGen/OpenMP/single_parallel_loop___%for.i---%exit.jscop @@ -0,0 +1,17 @@ +{ + "context" : "{ : }", + "name" : "for.i => exit", + "statements" : [ + { + "accesses" : [ + { + "kind" : "write", + "relation" : "{ Stmt_S[i0] -> MemRef_A[i0] }" + } + ], + "domain" : "{ Stmt_S[i0] : i0 >= 0 and i0 <= 1023 }", + "name" : "Stmt_S", + "schedule" : "{ Stmt_S[i0] -> scattering[0, floor(i0/4) * 4, i0] }" + } + ] +}