Index: include/polly/CodeGen/IslNodeBuilder.h =================================================================== --- include/polly/CodeGen/IslNodeBuilder.h +++ include/polly/CodeGen/IslNodeBuilder.h @@ -242,7 +242,7 @@ bool preloadInvariantEquivClass(const InvariantEquivClassTy &IAClass); void createForVector(__isl_take isl_ast_node *For, int VectorWidth); - void createForSequential(__isl_take isl_ast_node *For); + void createForSequential(__isl_take isl_ast_node *For, bool KnownParallel); /// Create LLVM-IR that executes a for node thread parallel. /// Index: lib/CodeGen/IslAst.cpp =================================================================== --- lib/CodeGen/IslAst.cpp +++ lib/CodeGen/IslAst.cpp @@ -255,11 +255,13 @@ // tested for parallelism. Test them here to ensure we check all innermost // loops for parallelism. if (Payload->IsInnermost && BuildInfo->InParallelFor) { - if (Payload->IsOutermostParallel) + if (Payload->IsOutermostParallel) { Payload->IsInnermostParallel = true; - else - Payload->IsInnermostParallel = - astScheduleDimIsParallel(Build, BuildInfo->Deps, Payload); + } else { + if (PollyVectorizerChoice == VECTORIZER_NONE) + Payload->IsInnermostParallel = + astScheduleDimIsParallel(Build, BuildInfo->Deps, Payload); + } } if (Payload->IsOutermostParallel) BuildInfo->InParallelFor = false; @@ -268,6 +270,31 @@ return Node; } +static isl_stat astBuildBeforeMark(__isl_keep isl_id *MarkId, + __isl_keep isl_ast_build *Build, + void *User) { + if (!MarkId) + return isl_stat_error; + + AstBuildUserInfo *BuildInfo = (AstBuildUserInfo *)User; + if (!strcmp(isl_id_get_name(MarkId), "SIMD")) + BuildInfo->InParallelFor = true; + + return isl_stat_ok; +} + +static __isl_give isl_ast_node * +astBuildAfterMark(__isl_take isl_ast_node *Node, + __isl_keep isl_ast_build *Build, void *User) { + assert(isl_ast_node_get_type(Node) == isl_ast_node_mark); + AstBuildUserInfo *BuildInfo = (AstBuildUserInfo *)User; + auto *Id = isl_ast_node_mark_get_id(Node); + if (!strcmp(isl_id_get_name(Id), "SIMD")) + BuildInfo->InParallelFor = false; + isl_id_free(Id); + return Node; +} + static __isl_give isl_ast_node *AtEachDomain(__isl_take isl_ast_node *Node, __isl_keep isl_ast_build *Build, void *User) { @@ -383,6 +410,12 @@ &BuildInfo); Build = isl_ast_build_set_after_each_for(Build, &astBuildAfterFor, &BuildInfo); + + Build = isl_ast_build_set_before_each_mark(Build, &astBuildBeforeMark, + &BuildInfo); + + Build = isl_ast_build_set_after_each_mark(Build, &astBuildAfterMark, + &BuildInfo); } buildRunCondition(Build); Index: lib/CodeGen/IslNodeBuilder.cpp =================================================================== --- lib/CodeGen/IslNodeBuilder.cpp +++ lib/CodeGen/IslNodeBuilder.cpp @@ -352,9 +352,24 @@ } void IslNodeBuilder::createMark(__isl_take isl_ast_node *Node) { + auto *Id = isl_ast_node_mark_get_id(Node); auto Child = isl_ast_node_mark_get_node(Node); - create(Child); isl_ast_node_free(Node); + // If a child node of a 'SIMD mark' is a loop that has a single iteration, + // it will be optimized away and we should skip it. + if (!strcmp(isl_id_get_name(Id), "SIMD") && + isl_ast_node_get_type(Child) == isl_ast_node_for) { + bool Vector = PollyVectorizerChoice == VECTORIZER_POLLY; + int VectorWidth = getNumberOfIterations(Child); + if (Vector && 1 < VectorWidth && VectorWidth <= 16) + createForVector(Child, VectorWidth); + else + createForSequential(Child, true); + isl_id_free(Id); + return; + } + create(Child); + isl_id_free(Id); } void IslNodeBuilder::createForVector(__isl_take isl_ast_node *For, @@ -417,7 +432,8 @@ isl_ast_expr_free(Iterator); } -void IslNodeBuilder::createForSequential(__isl_take isl_ast_node *For) { +void IslNodeBuilder::createForSequential(__isl_take isl_ast_node *For, + bool KnownParallel) { isl_ast_node *Body; isl_ast_expr *Init, *Inc, *Iterator, *UB; isl_id *IteratorID; @@ -428,8 +444,8 @@ CmpInst::Predicate Predicate; bool Parallel; - Parallel = - IslAstInfo::isParallel(For) && !IslAstInfo::isReductionParallel(For); + Parallel = KnownParallel || (IslAstInfo::isParallel(For) && + !IslAstInfo::isReductionParallel(For)); Body = isl_ast_node_for_get_body(For); @@ -647,7 +663,7 @@ createForParallel(For); return; } - createForSequential(For); + createForSequential(For, false); } void IslNodeBuilder::createIf(__isl_take isl_ast_node *If) { Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -289,6 +289,10 @@ Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }")); Node = isl_schedule_node_band_sink(Node); Node = isl_schedule_node_child(Node, 0); + if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf) + Node = isl_schedule_node_parent(Node); + isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr); + Node = isl_schedule_node_insert_mark(Node, LoopMarker); return Node; } Index: test/Isl/CodeGen/simple_vec_strides_multidim.ll =================================================================== --- test/Isl/CodeGen/simple_vec_strides_multidim.ll +++ test/Isl/CodeGen/simple_vec_strides_multidim.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-codegen -polly-vectorizer=polly -S -dce < %s | FileCheck %s +; RUN: opt %loadPolly -polly-opt-isl -polly-codegen -polly-vectorizer=polly -polly-prevect-width=8 -S -dce < %s | FileCheck %s ; ; void foo(long n, float A[restrict][n], float B[restrict][n], ; float C[restrict][n], float D[restrict][n]) { Index: test/ScheduleOptimizer/full_partial_tile_separation.ll =================================================================== --- test/ScheduleOptimizer/full_partial_tile_separation.ll +++ test/ScheduleOptimizer/full_partial_tile_separation.ll @@ -1,23 +1,26 @@ ; RUN: opt -S %loadPolly -polly-vectorizer=stripmine -polly-opt-isl -polly-ast -analyze < %s | FileCheck %s ; CHECK: // 1st level tiling - Tiles -; CHECK-NEXT: #pragma known-parallel -; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1) -; CHECK-NEXT: for (int c1 = 0; c1 <= floord(nj - 1, 32); c1 += 1) -; CHECK-NEXT: for (int c2 = 0; c2 <= floord(nk - 1, 32); c2 += 1) { -; CHECK-NEXT: // 1st level tiling - Points -; CHECK-NEXT: for (int c3 = 0; c3 <= min(31, ni - 32 * c0 - 1); c3 += 1) { -; CHECK-NEXT: for (int c4 = 0; c4 <= min(7, -8 * c1 + nj / 4 - 1); c4 += 1) -; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) -; CHECK-NEXT: #pragma simd -; CHECK-NEXT: for (int c6 = 0; c6 <= 3; c6 += 1) -; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5); -; CHECK-NEXT: if (32 * c1 + 31 >= nj) -; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) -; CHECK-NEXT: #pragma simd -; CHECK-NEXT: for (int c6 = 0; c6 < nj % 4; c6 += 1) -; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, -(nj % 4) + nj + c6, 32 * c2 + c5); -; CHECK-NEXT: } -; CHECK-NEXT: } +; CHECK-NEXT: #pragma known-parallel +; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1) +; CHECK-NEXT: for (int c1 = 0; c1 <= floord(nj - 1, 32); c1 += 1) +; CHECK-NEXT: for (int c2 = 0; c2 <= floord(nk - 1, 32); c2 += 1) { +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: for (int c3 = 0; c3 <= min(31, ni - 32 * c0 - 1); c3 += 1) { +; CHECK-NEXT: for (int c4 = 0; c4 <= min(7, -8 * c1 + nj / 4 - 1); c4 += 1) +; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) { +; CHECK-NEXT: // SIMD +; CHECK-NEXT: for (int c6 = 0; c6 <= 3; c6 += 1) +; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5); +; CHECK-NEXT: } +; CHECK-NEXT: if (32 * c1 + 31 >= nj) +; CHECK-NEXT: #pragma minimal dependence distance: 1 +; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) { +; CHECK-NEXT: // SIMD +; CHECK-NEXT: for (int c6 = 0; c6 < nj % 4; c6 += 1) +; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, -(nj % 4) + nj + c6, 32 * c2 + c5); +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } ; Function Attrs: nounwind uwtable define void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) #0 { Index: test/ScheduleOptimizer/prevectorization-without-tiling.ll =================================================================== --- test/ScheduleOptimizer/prevectorization-without-tiling.ll +++ test/ScheduleOptimizer/prevectorization-without-tiling.ll @@ -56,14 +56,14 @@ ; CHECK: #pragma known-parallel ; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1) ; CHECK: for (int c1 = 0; c1 <= 383; c1 += 1) -; CHECK: #pragma simd +; CHECK: // SIMD ; CHECK: for (int c2 = 0; c2 <= 3; c2 += 1) ; CHECK: Stmt_for_body3(c0, 4 * c1 + c2); ; CHECK: #pragma known-parallel ; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1) ; CHECK: for (int c1 = 0; c1 <= 383; c1 += 1) ; CHECK: for (int c2 = 0; c2 <= 1535; c2 += 1) -; CHECK: #pragma simd +; CHECK: // SIMD ; CHECK: for (int c3 = 0; c3 <= 3; c3 += 1) ; CHECK: Stmt_for_body8(c0, 4 * c1 + c3, c2); Index: test/ScheduleOptimizer/prevectorization.ll =================================================================== --- test/ScheduleOptimizer/prevectorization.ll +++ test/ScheduleOptimizer/prevectorization.ll @@ -65,7 +65,7 @@ ; CHECK: for (int c1 = 0; c1 <= 47; c1 += 1) ; CHECK: for (int c2 = 0; c2 <= 31; c2 += 1) ; CHECK: for (int c3 = 0; c3 <= 7; c3 += 1) -; CHECK: #pragma simd +; CHECK: // SIMD ; CHECK: for (int c4 = 0; c4 <= 3; c4 += 1) ; CHECK: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 4 * c3 + c4); ; CHECK: #pragma known-parallel @@ -75,7 +75,7 @@ ; CHECK: for (int c3 = 0; c3 <= 31; c3 += 1) ; CHECK: for (int c4 = 0; c4 <= 7; c4 += 1) ; CHECK: for (int c5 = 0; c5 <= 31; c5 += 1) -; CHECK: #pragma simd +; CHECK: // SIMD ; CHECK: for (int c6 = 0; c6 <= 3; c6 += 1) ; CHECK: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5); @@ -85,7 +85,7 @@ ; VEC16: for (int c1 = 0; c1 <= 47; c1 += 1) ; VEC16: for (int c2 = 0; c2 <= 31; c2 += 1) ; VEC16: for (int c3 = 0; c3 <= 1; c3 += 1) -; VEC16: #pragma simd +; VEC16: // SIMD ; VEC16: for (int c4 = 0; c4 <= 15; c4 += 1) ; VEC16: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 16 * c3 + c4); ; VEC16: #pragma known-parallel @@ -95,7 +95,7 @@ ; VEC16: for (int c3 = 0; c3 <= 31; c3 += 1) ; VEC16: for (int c4 = 0; c4 <= 1; c4 += 1) ; VEC16: for (int c5 = 0; c5 <= 31; c5 += 1) -; VEC16: #pragma simd +; VEC16: // SIMD ; VEC16: for (int c6 = 0; c6 <= 15; c6 += 1) ; VEC16: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 16 * c4 + c6, 32 * c2 + c5); ; VEC16: } Index: test/ScheduleOptimizer/rectangular-tiling.ll =================================================================== --- test/ScheduleOptimizer/rectangular-tiling.ll +++ test/ScheduleOptimizer/rectangular-tiling.ll @@ -74,10 +74,10 @@ ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c3 = 0; c3 <= 1; c3 += 1) ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c4 = 0; c4 <= 7; c4 += 1) ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c5 = 0; c5 <= 1; c5 += 1) { -; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: #pragma simd +; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: // SIMD ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c8 = 0; c8 <= 3; c8 += 1) ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: Stmt_for_body3(256 * c0 + 16 * c2 + 2 * c4, 16 * c1 + 8 * c3 + 4 * c5 + c8); -; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: #pragma simd +; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: // SIMD ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c8 = 0; c8 <= 3; c8 += 1) ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: Stmt_for_body3(256 * c0 + 16 * c2 + 2 * c4 + 1, 16 * c1 + 8 * c3 + 4 * c5 + c8); ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: } Index: test/ScopInfo/stride_detection.ll =================================================================== --- test/ScopInfo/stride_detection.ll +++ test/ScopInfo/stride_detection.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-dir=%S -polly-vectorizer=polly -polly-codegen < %s -S | FileCheck %s +; RUN: opt %loadPolly -polly-opt-isl -polly-vectorizer=polly -polly-codegen < %s -S | FileCheck %s ; #pragma known-parallel ; for (int c0 = 0; c0 <= 31; c0 += 1)