Index: lib/CodeGen/IslAst.cpp
===================================================================
--- lib/CodeGen/IslAst.cpp
+++ lib/CodeGen/IslAst.cpp
@@ -119,6 +119,9 @@
   /// @brief Flag to indicate that we are inside a parallel for node.
   bool InParallelFor;
 
+  /// @brief Flag to indicate that we met a SIMD mark.
+  bool SIMDMark;
+
   /// @brief The last iterator id created for the current SCoP.
   isl_id *LastForNodeId;
 };
@@ -250,7 +253,7 @@
   BuildInfo->LastForNodeId = Id;
 
   // Test for parallelism only if we are not already inside a parallel loop
-  if (!BuildInfo->InParallelFor)
+  if (!BuildInfo->SIMDMark && !BuildInfo->InParallelFor)
     BuildInfo->InParallelFor = Payload->IsOutermostParallel =
         astScheduleDimIsParallel(Build, BuildInfo->Deps, Payload);
 
@@ -277,6 +280,12 @@
   Payload->Build = isl_ast_build_copy(Build);
   Payload->IsInnermost = (Id == BuildInfo->LastForNodeId);
 
+  // Skip parallelism checks, if a SIMD mark is met.
+  if (BuildInfo->SIMDMark) {
+    isl_id_free(Id);
+    return Node;
+  }
+
   // Innermost loops that are surrounded by parallel loops have not yet been
   // tested for parallelism. Test them here to ensure we check all innermost
   // loops for parallelism.
@@ -294,6 +303,50 @@
   return Node;
 }
 
+// Check whether @p Id carries the name 'SIMD'. Ids are allowed to be unnamed,
+// in which case isl_id_get_name() yields a null pointer that must not be
+// passed to strcmp().
+//
+static bool isSIMDMarkId(__isl_keep isl_id *Id) {
+  const char *Name = Id ? isl_id_get_name(Id) : nullptr;
+  return Name && !strcmp(Name, "SIMD");
+}
+
+// This method is executed before the construction of a mark node. It sets
+// the 'SIMDMark' flag, if we meet a mark node whose isl_id is named 'SIMD'.
+// This helps to skip parallelism checks.
+//
+// Returns isl_stat_error if no mark id is provided and isl_stat_ok otherwise.
+//
+static isl_stat astBuildBeforeMark(__isl_keep isl_id *MarkId,
+                                   __isl_keep isl_ast_build *Build,
+                                   void *User) {
+  if (!MarkId)
+    return isl_stat_error;
+
+  auto *BuildInfo = static_cast<AstBuildUserInfo *>(User);
+  if (isSIMDMarkId(MarkId))
+    BuildInfo->SIMDMark = true;
+
+  return isl_stat_ok;
+}
+
+// This method is executed after the construction of a mark node. It unsets
+// the 'SIMDMark' flag, if we leave a mark node whose isl_id is named 'SIMD'.
+// This indicates that we shouldn't skip parallelism checks.
+//
+static __isl_give isl_ast_node *
+astBuildAfterMark(__isl_take isl_ast_node *Node,
+                  __isl_keep isl_ast_build *Build, void *User) {
+  assert(isl_ast_node_get_type(Node) == isl_ast_node_mark);
+  auto *BuildInfo = static_cast<AstBuildUserInfo *>(User);
+  auto *Id = isl_ast_node_mark_get_id(Node);
+  if (isSIMDMarkId(Id))
+    BuildInfo->SIMDMark = false;
+  isl_id_free(Id);
+  return Node;
+}
+
 static __isl_give isl_ast_node *AtEachDomain(__isl_take isl_ast_node *Node,
                                              __isl_keep isl_ast_build *Build,
                                              void *User) {
@@ -402,11 +455,19 @@
   if (PerformParallelTest) {
     BuildInfo.Deps = &D;
     BuildInfo.InParallelFor = 0;
+    BuildInfo.SIMDMark = 0;
 
     Build = isl_ast_build_set_before_each_for(Build, &astBuildBeforeFor,
                                               &BuildInfo);
+
     Build =
         isl_ast_build_set_after_each_for(Build, &astBuildAfterFor, &BuildInfo);
+
+    Build = isl_ast_build_set_before_each_mark(Build, &astBuildBeforeMark,
+                                               &BuildInfo);
+
+    Build = isl_ast_build_set_after_each_mark(Build, &astBuildAfterMark,
+                                              &BuildInfo);
   }
 
   buildRunCondition(Build);
Index: lib/CodeGen/IslNodeBuilder.cpp
===================================================================
--- lib/CodeGen/IslNodeBuilder.cpp
+++ lib/CodeGen/IslNodeBuilder.cpp
@@ -352,9 +352,28 @@
 }
 
 void IslNodeBuilder::createMark(__isl_take isl_ast_node *Node) {
+  auto *Id = isl_ast_node_mark_get_id(Node);
   auto Child = isl_ast_node_mark_get_node(Node);
-  create(Child);
   isl_ast_node_free(Node);
+
+  // The name is owned by the id, so evaluate it before the id is released.
+  // Mark ids may be unnamed; never hand a null name to strcmp().
+  const char *MarkName = isl_id_get_name(Id);
+  bool IsSIMDMark = MarkName && !strcmp(MarkName, "SIMD");
+  isl_id_free(Id);
+
+  // The vector path presumably expects a for-node (createForVector names its
+  // parameter 'For'); the child of a mark is not guaranteed to be one.
+  bool Vector = PollyVectorizerChoice == VECTORIZER_POLLY;
+  if (Vector && IsSIMDMark &&
+      isl_ast_node_get_type(Child) == isl_ast_node_for) {
+    int VectorWidth = getNumberOfIterations(Child);
+    if (1 < VectorWidth && VectorWidth <= 16) {
+      createForVector(Child, VectorWidth);
+      return;
+    }
+  }
+  create(Child);
 }
 
 void IslNodeBuilder::createForVector(__isl_take isl_ast_node *For,
Index: lib/Transform/ScheduleOptimizer.cpp
===================================================================
--- lib/Transform/ScheduleOptimizer.cpp
+++ 
lib/Transform/ScheduleOptimizer.cpp @@ -289,6 +289,10 @@ Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }")); Node = isl_schedule_node_band_sink(Node); Node = isl_schedule_node_child(Node, 0); + if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf) + Node = isl_schedule_node_parent(Node); + isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr); + Node = isl_schedule_node_insert_mark(Node, LoopMarker); return Node; } Index: test/ScheduleOptimizer/full_partial_tile_separation.ll =================================================================== --- test/ScheduleOptimizer/full_partial_tile_separation.ll +++ test/ScheduleOptimizer/full_partial_tile_separation.ll @@ -9,17 +9,17 @@ ; CHECK: for (int c3 = 0; c3 <= min(31, ni - 32 * c0 - 1); c3 += 1) { ; CHECK: for (int c4 = 0; c4 <= min(7, -8 * c1 + nj / 4 - 1); c4 += 1) ; CHECK: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) -; CHECK: #pragma simd +; CHECK: // SIMD ; CHECK: for (int c6 = 0; c6 <= 3; c6 += 1) ; CHECK: Stmt_for_body_6(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5); ; CHECK: if (nj >= 32 * c1 + 4 && 32 * c1 + 31 >= nj) { ; CHECK: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) -; CHECK: #pragma simd +; CHECK: // SIMD ; CHECK: for (int c6 = 0; c6 < nj % 4; c6 += 1) ; CHECK: Stmt_for_body_6(32 * c0 + c3, -((nj - 1) % 4) + nj + c6 - 1, 32 * c2 + c5); ; CHECK: } else if (32 * c1 + 3 >= nj) ; CHECK: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) -; CHECK: #pragma simd +; CHECK: // SIMD ; CHECK: for (int c6 = 0; c6 < nj - 32 * c1; c6 += 1) ; CHECK: Stmt_for_body_6(32 * c0 + c3, 32 * c1 + c6, 32 * c2 + c5); ; CHECK: } Index: test/ScheduleOptimizer/prevectorization-without-tiling.ll =================================================================== --- test/ScheduleOptimizer/prevectorization-without-tiling.ll +++ test/ScheduleOptimizer/prevectorization-without-tiling.ll @@ -56,14 +56,14 @@ ; CHECK: #pragma known-parallel ; CHECK: for (int c0 = 0; c0 
<= 1535; c0 += 1) ; CHECK: for (int c1 = 0; c1 <= 383; c1 += 1) -; CHECK: #pragma simd +; CHECK: // SIMD ; CHECK: for (int c2 = 0; c2 <= 3; c2 += 1) ; CHECK: Stmt_for_body3(c0, 4 * c1 + c2); ; CHECK: #pragma known-parallel ; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1) ; CHECK: for (int c1 = 0; c1 <= 383; c1 += 1) ; CHECK: for (int c2 = 0; c2 <= 1535; c2 += 1) -; CHECK: #pragma simd +; CHECK: // SIMD ; CHECK: for (int c3 = 0; c3 <= 3; c3 += 1) ; CHECK: Stmt_for_body8(c0, 4 * c1 + c3, c2); Index: test/ScheduleOptimizer/prevectorization.ll =================================================================== --- test/ScheduleOptimizer/prevectorization.ll +++ test/ScheduleOptimizer/prevectorization.ll @@ -65,7 +65,7 @@ ; CHECK: for (int c1 = 0; c1 <= 47; c1 += 1) ; CHECK: for (int c2 = 0; c2 <= 31; c2 += 1) ; CHECK: for (int c3 = 0; c3 <= 7; c3 += 1) -; CHECK: #pragma simd +; CHECK: // SIMD ; CHECK: for (int c4 = 0; c4 <= 3; c4 += 1) ; CHECK: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 4 * c3 + c4); ; CHECK: #pragma known-parallel @@ -75,7 +75,7 @@ ; CHECK: for (int c3 = 0; c3 <= 31; c3 += 1) ; CHECK: for (int c4 = 0; c4 <= 7; c4 += 1) ; CHECK: for (int c5 = 0; c5 <= 31; c5 += 1) -; CHECK: #pragma simd +; CHECK: // SIMD ; CHECK: for (int c6 = 0; c6 <= 3; c6 += 1) ; CHECK: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5); @@ -85,7 +85,7 @@ ; VEC16: for (int c1 = 0; c1 <= 47; c1 += 1) ; VEC16: for (int c2 = 0; c2 <= 31; c2 += 1) ; VEC16: for (int c3 = 0; c3 <= 1; c3 += 1) -; VEC16: #pragma simd +; VEC16: // SIMD ; VEC16: for (int c4 = 0; c4 <= 15; c4 += 1) ; VEC16: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 16 * c3 + c4); ; VEC16: #pragma known-parallel @@ -95,7 +95,7 @@ ; VEC16: for (int c3 = 0; c3 <= 31; c3 += 1) ; VEC16: for (int c4 = 0; c4 <= 1; c4 += 1) ; VEC16: for (int c5 = 0; c5 <= 31; c5 += 1) -; VEC16: #pragma simd +; VEC16: // SIMD ; VEC16: for (int c6 = 0; c6 <= 15; c6 += 1) ; VEC16: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 16 * c4 + c6, 32 * c2 + c5); ; 
VEC16: } Index: test/ScheduleOptimizer/rectangular-tiling.ll =================================================================== --- test/ScheduleOptimizer/rectangular-tiling.ll +++ test/ScheduleOptimizer/rectangular-tiling.ll @@ -74,10 +74,10 @@ ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c3 = 0; c3 <= 1; c3 += 1) ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c4 = 0; c4 <= 7; c4 += 1) ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c5 = 0; c5 <= 1; c5 += 1) { -; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: #pragma simd +; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: // SIMD ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c8 = 0; c8 <= 3; c8 += 1) ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: Stmt_for_body3(256 * c0 + 16 * c2 + 2 * c4, 16 * c1 + 8 * c3 + 4 * c5 + c8); -; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: #pragma simd +; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: // SIMD ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c8 = 0; c8 <= 3; c8 += 1) ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: Stmt_for_body3(256 * c0 + 16 * c2 + 2 * c4 + 1, 16 * c1 + 8 * c3 + 4 * c5 + c8); ; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: }