Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -24,6 +24,7 @@ #include "isl/map.h" #include "isl/options.h" #include "isl/schedule.h" +#include "isl/schedule_node.h" #include "isl/space.h" #include "polly/CodeGen/CodeGeneration.h" #include "polly/DependenceInfo.h" @@ -114,44 +115,6 @@ /// @return True, if we believe @p NewSchedule is an improvement for @p S. bool isProfitableSchedule(Scop &S, __isl_keep isl_union_map *NewSchedule); - static void extendScattering(Scop &S, unsigned NewDimensions); - - /// @brief Create a map that describes a n-dimensonal tiling. - /// - /// getTileMap creates a map from a n-dimensional scattering space into an - /// 2*n-dimensional scattering space. The map describes a rectangular - /// tiling. - /// - /// Example: - /// scheduleDimensions = 2, parameterDimensions = 1, TileSizes = <32, 64> - /// - /// tileMap := [p0] -> {[s0, s1] -> [t0, t1, s0, s1]: - /// t0 % 32 = 0 and t0 <= s0 < t0 + 32 and - /// t1 % 64 = 0 and t1 <= s1 < t1 + 64} - /// - /// Before tiling: - /// - /// for (i = 0; i < N; i++) - /// for (j = 0; j < M; j++) - /// S(i,j) - /// - /// After tiling: - /// - /// for (t_i = 0; t_i < N; i+=32) - /// for (t_j = 0; t_j < M; j+=64) - /// for (i = t_i; i < min(t_i + 32, N); i++) | Unknown that N % 32 = 0 - /// for (j = t_j; j < t_j + 64; j++) | Known that M % 64 = 0 - /// S(i,j) - /// - static isl_basic_map *getTileMap(isl_ctx *ctx, int scheduleDimensions); - - /// @brief Get the schedule for this band. - /// - /// Polly applies transformations like tiling on top of the isl calculated - /// value. This can influence the number of scheduling dimension. The - /// number of schedule dimensions is returned in the parameter 'Dimension'. - static isl_union_map *getScheduleForBand(isl_band *Band, int *Dimensions); - /// @brief Create a map that pre-vectorizes one scheduling dimension. /// /// getPrevectorMap creates a map that maps each input dimension to the same @@ -194,12 +157,21 @@ int ScheduleDimensions, int VectorWidth = 4); - /// @brief Get the scheduling map for a list of bands. + /// @brief Apply additional optimizations on the bands in the schedule tree. + /// + /// We are looking for an innermost band node and apply the following + /// transformations: + /// + /// - Tile the band + /// - if the band is tileable + /// - if the band has more than one loop dimension + /// + /// - Prevectorize the point loop of the tile + /// - if vectorization is enabled /// - /// Walk recursively the forest of bands to combine the schedules of the - /// individual bands to the overall schedule. In case tiling is requested, - /// the individual bands are tiled. - static isl_union_map *getScheduleForBandList(isl_band_list *BandList); + /// @param Node The schedule node to (possibly) optimize. + /// @param User A pointer to forward some use information (currently unused). + static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User); static __isl_give isl_union_map * getScheduleMap(__isl_keep isl_schedule *Schedule); @@ -216,118 +188,6 @@ char IslScheduleOptimizer::ID = 0; -void IslScheduleOptimizer::extendScattering(Scop &S, unsigned NewDimensions) { - for (ScopStmt *Stmt : S) { - unsigned OldDimensions = Stmt->getNumScattering(); - isl_space *Space; - isl_map *Map, *New; - - Space = isl_space_alloc(Stmt->getIslCtx(), 0, OldDimensions, NewDimensions); - Map = isl_map_universe(Space); - - for (unsigned i = 0; i < OldDimensions; i++) - Map = isl_map_equate(Map, isl_dim_in, i, isl_dim_out, i); - - for (unsigned i = OldDimensions; i < NewDimensions; i++) - Map = isl_map_fix_si(Map, isl_dim_out, i, 0); - - Map = isl_map_align_params(Map, S.getParamSpace()); - New = isl_map_apply_range(Stmt->getScattering(), Map); - Stmt->setScattering(New); - } -} - -isl_basic_map *IslScheduleOptimizer::getTileMap(isl_ctx *ctx, - int scheduleDimensions) { - // We construct - // - // tileMap := [p0] -> {[s0, s1] -> [t0, t1, p0, p1, a0, a1]: - // s0 = a0 * 32 and s0 = p0 and t0 <= p0 < t0 + 64 and - // s1 = a1 * 64 and s1 = p1 and t1 <= p1 < t1 + 64} - // - // and project out the auxilary dimensions a0 and a1. - isl_space *Space = - isl_space_alloc(ctx, 0, scheduleDimensions, scheduleDimensions * 3); - isl_basic_map *tileMap = isl_basic_map_universe(isl_space_copy(Space)); - - isl_local_space *LocalSpace = isl_local_space_from_space(Space); - - for (int x = 0; x < scheduleDimensions; x++) { - int sX = x; - int tX = x; - int pX = scheduleDimensions + x; - int aX = 2 * scheduleDimensions + x; - int tileSize = (int)TileSizes.size() > x ? TileSizes[x] : DefaultTileSize; - assert(tileSize > 0 && "Invalid tile size"); - - isl_constraint *c; - - // sX = aX * tileSize; - c = isl_equality_alloc(isl_local_space_copy(LocalSpace)); - isl_constraint_set_coefficient_si(c, isl_dim_out, sX, 1); - isl_constraint_set_coefficient_si(c, isl_dim_out, aX, -tileSize); - tileMap = isl_basic_map_add_constraint(tileMap, c); - - // pX = sX; - c = isl_equality_alloc(isl_local_space_copy(LocalSpace)); - isl_constraint_set_coefficient_si(c, isl_dim_out, pX, 1); - isl_constraint_set_coefficient_si(c, isl_dim_in, sX, -1); - tileMap = isl_basic_map_add_constraint(tileMap, c); - - // tX <= pX - c = isl_inequality_alloc(isl_local_space_copy(LocalSpace)); - isl_constraint_set_coefficient_si(c, isl_dim_out, pX, 1); - isl_constraint_set_coefficient_si(c, isl_dim_out, tX, -1); - tileMap = isl_basic_map_add_constraint(tileMap, c); - - // pX <= tX + (tileSize - 1) - c = isl_inequality_alloc(isl_local_space_copy(LocalSpace)); - isl_constraint_set_coefficient_si(c, isl_dim_out, tX, 1); - isl_constraint_set_coefficient_si(c, isl_dim_out, pX, -1); - isl_constraint_set_constant_si(c, tileSize - 1); - tileMap = isl_basic_map_add_constraint(tileMap, c); - } - - // Project out auxilary dimensions. - // - // The auxilary dimensions are transformed into existentially quantified ones. - // This reduces the number of visible scattering dimensions and allows Cloog - // to produces better code. - tileMap = isl_basic_map_project_out( - tileMap, isl_dim_out, 2 * scheduleDimensions, scheduleDimensions); - isl_local_space_free(LocalSpace); - return tileMap; -} - -isl_union_map *IslScheduleOptimizer::getScheduleForBand(isl_band *Band, - int *Dimensions) { - isl_union_map *PartialSchedule; - isl_ctx *ctx; - isl_space *Space; - isl_basic_map *TileMap; - isl_union_map *TileUMap; - - PartialSchedule = isl_band_get_partial_schedule(Band); - *Dimensions = isl_band_n_member(Band); - - if (DisableTiling) - return PartialSchedule; - - // It does not make any sense to tile a band with just one dimension. - if (*Dimensions == 1) - return PartialSchedule; - - ctx = isl_union_map_get_ctx(PartialSchedule); - Space = isl_union_map_get_space(PartialSchedule); - - TileMap = getTileMap(ctx, *Dimensions); - TileUMap = isl_union_map_from_map(isl_map_from_basic_map(TileMap)); - TileUMap = isl_union_map_align_params(TileUMap, Space); - *Dimensions = 2 * *Dimensions; - - return isl_union_map_apply_range(PartialSchedule, TileUMap); -} - __isl_give isl_map * IslScheduleOptimizer::getPrevectorMap(isl_ctx *ctx, int DimToVectorize, int ScheduleDimensions, int VectorWidth) { @@ -389,72 +249,74 @@ return TilingMap; } -isl_union_map * -IslScheduleOptimizer::getScheduleForBandList(isl_band_list *BandList) { - int NumBands; - isl_union_map *Schedule; - isl_ctx *ctx; - - ctx = isl_band_list_get_ctx(BandList); - NumBands = isl_band_list_n_band(BandList); - Schedule = isl_union_map_empty(isl_space_params_alloc(ctx, 0)); - - for (int i = 0; i < NumBands; i++) { - isl_band *Band; - isl_union_map *PartialSchedule; - int ScheduleDimensions; - isl_space *Space; - - Band = isl_band_list_get_band(BandList, i); - PartialSchedule = getScheduleForBand(Band, &ScheduleDimensions); - Space = isl_union_map_get_space(PartialSchedule); - - if (isl_band_has_children(Band)) { - isl_band_list *Children; - isl_union_map *SuffixSchedule; - - Children = isl_band_get_children(Band); - SuffixSchedule = getScheduleForBandList(Children); - PartialSchedule = - isl_union_map_flat_range_product(PartialSchedule, SuffixSchedule); - isl_band_list_free(Children); - } else if (PollyVectorizerChoice != VECTORIZER_NONE) { - // In case we are at the innermost band, we try to prepare for - // vectorization. This means, we look for the innermost parallel loop - // and strip mine this loop to the innermost level using a strip-mine - // factor corresponding to the number of vector iterations. - int NumDims = isl_band_n_member(Band); - for (int j = NumDims - 1; j >= 0; j--) { - if (isl_band_member_is_coincident(Band, j)) { - isl_map *TileMap; - isl_union_map *TileUMap; - - TileMap = getPrevectorMap(ctx, ScheduleDimensions - NumDims + j, - ScheduleDimensions); - TileUMap = isl_union_map_from_map(TileMap); - TileUMap = - isl_union_map_align_params(TileUMap, isl_space_copy(Space)); - PartialSchedule = - isl_union_map_apply_range(PartialSchedule, TileUMap); - break; - } - } - } +isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node, + void *User) { + if (isl_schedule_node_get_type(Node) != isl_schedule_node_band) + return Node; + + if (isl_schedule_node_n_children(Node) != 1) + return Node; - Schedule = isl_union_map_union(Schedule, PartialSchedule); + if (!isl_schedule_node_band_get_permutable(Node)) + return Node; - isl_band_free(Band); + auto Space = isl_schedule_node_band_get_space(Node); + auto Dims = isl_space_dim(Space, isl_dim_set); + + if (Dims <= 1) { isl_space_free(Space); + return Node; } - return Schedule; + auto Child = isl_schedule_node_get_child(Node, 0); + auto Type = isl_schedule_node_get_type(Child); + isl_schedule_node_free(Child); + + if (Type != isl_schedule_node_leaf) { + isl_space_free(Space); + return Node; + } + + auto Sizes = isl_multi_val_zero(Space); + auto Ctx = isl_schedule_node_get_ctx(Node); + + for (unsigned i = 0; i < Dims; i++) { + auto tileSize = TileSizes.size() > i ? TileSizes[i] : DefaultTileSize; + Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize)); + } + + auto Res = isl_schedule_node_band_tile(Node, Sizes); + + if (PollyVectorizerChoice == VECTORIZER_NONE) + return Res; + + Child = isl_schedule_node_get_child(Res, 0); + auto ChildSchedule = isl_schedule_node_band_get_partial_schedule(Child); + + for (int i = Dims - 1; i >= 0; i--) { + if (isl_schedule_node_band_member_get_coincident(Child, i)) { + auto TileMap = IslScheduleOptimizer::getPrevectorMap(Ctx, i, Dims); + auto TileUMap = isl_union_map_from_map(TileMap); + auto ChildSchedule2 = isl_union_map_apply_range( + isl_union_map_from_multi_union_pw_aff(ChildSchedule), TileUMap); + ChildSchedule = isl_multi_union_pw_aff_from_union_map(ChildSchedule2); + break; + } + } + + isl_schedule_node_free(Res); + Res = isl_schedule_node_delete(Child); + Res = isl_schedule_node_insert_partial_schedule(Res, ChildSchedule); + return Res; } __isl_give isl_union_map * IslScheduleOptimizer::getScheduleMap(__isl_keep isl_schedule *Schedule) { - isl_band_list *BandList = isl_schedule_get_band_forest(Schedule); - isl_union_map *ScheduleMap = getScheduleForBandList(BandList); - isl_band_list_free(BandList); + isl_schedule_node *Root = isl_schedule_get_root(Schedule); + Root = isl_schedule_node_map_descendant( + Root, IslScheduleOptimizer::optimizeBand, NULL); + auto ScheduleMap = isl_schedule_node_get_subtree_schedule_union_map(Root); + isl_schedule_node_free(Root); return ScheduleMap; } @@ -618,15 +480,8 @@ Stmt->setScattering(StmtSchedule); } + isl_schedule_free(Schedule); isl_union_map_free(NewSchedule); - LastSchedule = Schedule; - - unsigned MaxScatDims = 0; - - for (ScopStmt *Stmt : S) - MaxScatDims = std::max(Stmt->getNumScattering(), MaxScatDims); - - extendScattering(S, MaxScatDims); return false; } Index: test/ScheduleOptimizer/line-tiling-2.ll =================================================================== --- test/ScheduleOptimizer/line-tiling-2.ll +++ test/ScheduleOptimizer/line-tiling-2.ll @@ -2,8 +2,8 @@ ; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1) ; CHECK: for (int c1 = 0; c1 <= 511; c1 += 64) -; CHECK: for (int c3 = c1; c3 <= c1 + 63; c3 += 1) -; CHECK: Stmt_for_body3(c0, c3); +; CHECK: for (int c3 = 0; c3 <= 63; c3 += 1) +; CHECK: Stmt_for_body3(c0, c1 + c3); target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" Index: test/ScheduleOptimizer/line-tiling.ll =================================================================== --- test/ScheduleOptimizer/line-tiling.ll +++ test/ScheduleOptimizer/line-tiling.ll @@ -2,8 +2,8 @@ ; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 64) ; CHECK: for (int c1 = 0; c1 <= 511; c1 += 1) -; CHECK: for (int c2 = c0; c2 <= c0 + 63; c2 += 1) -; CHECK: Stmt_for_body3(c2, c1); +; CHECK: for (int c2 = 0; c2 <= 63; c2 += 1) +; CHECK: Stmt_for_body3(c0 + c2, c1); target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" Index: test/ScheduleOptimizer/one-dimensional-band.ll =================================================================== --- /dev/null +++ test/ScheduleOptimizer/one-dimensional-band.ll @@ -0,0 +1,107 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-ast -analyze < %s | FileCheck %s +; +; void jacobi1d(long T, long N, float *A, float *B) { +; long t, i, j; +; for (t = 0; t < T; t++) { +; for (i = 1; i < N - 1; i++) +; B[i] = 0.33333 * (A[i - 1] + A[i] + A[i + 1]); +; for (j = 1; j < N - 1; j++) +; A[j] = 0.33333 * (B[i - 1] + B[i] + B[i + 1]); +; } +; } + +; Verify that we do not tile bands that have just a single dimension. + +; CHECK: for (int c0 = 0; c0 < T; c0 += 1) { +; CHECK: for (int c2 = 0; c2 < N - 2; c2 += 1) +; CHECK: Stmt_for_body3(c0, c2); +; CHECK: for (int c2 = 0; c2 < N - 2; c2 += 1) +; CHECK: Stmt_for_body15(c0, c2); +; CHECK: } + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @jacobi1d(i64 %T, i64 %N, float* %A, float* %B) { +entry: + %tmp = add i64 %N, -1 + %tmp1 = icmp sgt i64 %tmp, 1 + %smax = select i1 %tmp1, i64 %tmp, i64 1 + br label %for.cond + +for.cond: ; preds = %for.inc30, %entry + %t.0 = phi i64 [ 0, %entry ], [ %inc31, %for.inc30 ] + %cmp = icmp slt i64 %t.0, %T + br i1 %cmp, label %for.body, label %for.end32 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ] + %sub = add nsw i64 %N, -1 + %cmp2 = icmp slt i64 %i.0, %sub + br i1 %cmp2, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %sub4 = add nsw i64 %i.0, -1 + %arrayidx = getelementptr inbounds float, float* %A, i64 %sub4 + %tmp2 = load float, float* %arrayidx, align 4 + %arrayidx5 = getelementptr inbounds float, float* %A, i64 %i.0 + %tmp3 = load float, float* %arrayidx5, align 4 + %add = fadd float %tmp2, %tmp3 + %add6 = add nuw nsw i64 %i.0, 1 + %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add6 + %tmp4 = load float, float* %arrayidx7, align 4 + %add8 = fadd float %add, %tmp4 + %conv = fpext float %add8 to double + %mul = fmul double %conv, 3.333300e-01 + %conv9 = fptrunc double %mul to float + %arrayidx10 = getelementptr inbounds float, float* %B, i64 %i.0 + store float %conv9, float* %arrayidx10, align 4 + br label %for.inc + +for.inc: ; preds = %for.body3 + %inc = add nuw nsw i64 %i.0, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.cond11 + +for.cond11: ; preds = %for.inc27, %for.end + %j.0 = phi i64 [ 1, %for.end ], [ %inc28, %for.inc27 ] + %sub12 = add nsw i64 %N, -1 + %cmp13 = icmp slt i64 %j.0, %sub12 + br i1 %cmp13, label %for.body15, label %for.end29 + +for.body15: ; preds = %for.cond11 + %sub16 = add nsw i64 %smax, -1 + %arrayidx17 = getelementptr inbounds float, float* %B, i64 %sub16 + %tmp5 = load float, float* %arrayidx17, align 4 + %arrayidx18 = getelementptr inbounds float, float* %B, i64 %smax + %tmp6 = load float, float* %arrayidx18, align 4 + %add19 = fadd float %tmp5, %tmp6 + %add20 = add nsw i64 %smax, 1 + %arrayidx21 = getelementptr inbounds float, float* %B, i64 %add20 + %tmp7 = load float, float* %arrayidx21, align 4 + %add22 = fadd float %add19, %tmp7 + %conv23 = fpext float %add22 to double + %mul24 = fmul double %conv23, 3.333300e-01 + %conv25 = fptrunc double %mul24 to float + %arrayidx26 = getelementptr inbounds float, float* %A, i64 %j.0 + store float %conv25, float* %arrayidx26, align 4 + br label %for.inc27 + +for.inc27: ; preds = %for.body15 + %inc28 = add nuw nsw i64 %j.0, 1 + br label %for.cond11 + +for.end29: ; preds = %for.cond11 + br label %for.inc30 + +for.inc30: ; preds = %for.end29 + %inc31 = add nuw nsw i64 %t.0, 1 + br label %for.cond + +for.end32: ; preds = %for.cond + ret void +} Index: test/ScheduleOptimizer/prevectorization.ll =================================================================== --- test/ScheduleOptimizer/prevectorization.ll +++ test/ScheduleOptimizer/prevectorization.ll @@ -58,21 +58,21 @@ ; CHECK: #pragma known-parallel ; CHECK: for (int c1 = 0; c1 <= 1535; c1 += 32) ; CHECK: for (int c2 = 0; c2 <= 1535; c2 += 32) -; CHECK: for (int c3 = c1; c3 <= c1 + 31; c3 += 1) -; CHECK: for (int c4 = c2; c4 <= c2 + 31; c4 += 4) +; CHECK: for (int c3 = 0; c3 <= 31; c3 += 1) +; CHECK: for (int c4 = 0; c4 <= 31; c4 += 4) ; CHECK: #pragma simd ; CHECK: for (int c5 = c4; c5 <= c4 + 3; c5 += 1) -; CHECK: Stmt_for_body3(c3, c5); +; CHECK: Stmt_for_body3(c1 + c3, c2 + c5); ; CHECK: #pragma known-parallel ; CHECK: for (int c1 = 0; c1 <= 1535; c1 += 32) ; CHECK: for (int c2 = 0; c2 <= 1535; c2 += 32) ; CHECK: for (int c3 = 0; c3 <= 1535; c3 += 32) -; CHECK: for (int c4 = c1; c4 <= c1 + 31; c4 += 1) -; CHECK: for (int c5 = c2; c5 <= c2 + 31; c5 += 4) -; CHECK: for (int c6 = c3; c6 <= c3 + 31; c6 += 1) +; CHECK: for (int c4 = 0; c4 <= 31; c4 += 1) +; CHECK: for (int c5 = 0; c5 <= 31; c5 += 4) +; CHECK: for (int c6 = 0; c6 <= 31; c6 += 1) ; CHECK: #pragma simd ; CHECK: for (int c7 = c5; c7 <= c5 + 3; c7 += 1) -; CHECK: Stmt_for_body8(c4, c7, c6); +; CHECK: Stmt_for_body8(c1 + c4, c2 + c7, c3 + c6); !llvm.ident = !{!0} Index: test/ScheduleOptimizer/rectangular-tiling.ll =================================================================== --- test/ScheduleOptimizer/rectangular-tiling.ll +++ test/ScheduleOptimizer/rectangular-tiling.ll @@ -2,9 +2,9 @@ ; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 256) ; CHECK: for (int c1 = 0; c1 <= 511; c1 += 16) -; CHECK: for (int c2 = c0; c2 <= c0 + 255; c2 += 1) -; CHECK: for (int c3 = c1; c3 <= c1 + 15; c3 += 1) -; CHECK: Stmt_for_body3(c2, c3); +; CHECK: for (int c2 = 0; c2 <= 255; c2 += 1) +; CHECK: for (int c3 = 0; c3 <= 15; c3 += 1) +; CHECK: Stmt_for_body3(c0 + c2, c1 + c3); target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"