Index: polly/trunk/lib/External/pet/include/pet.h =================================================================== --- polly/trunk/lib/External/pet/include/pet.h +++ polly/trunk/lib/External/pet/include/pet.h @@ -19,6 +19,9 @@ struct pet_options; ISL_ARG_DECL(pet_options, struct pet_options, pet_options_args) +/* Create an isl_ctx that references the pet options. */ +isl_ctx *isl_ctx_alloc_with_pet_options(); + /* If autodetect is set, any valid scop is extracted. * Otherwise, the scop needs to be delimited by pragmas. */ @@ -74,6 +77,9 @@ pet_op_sub_assign, pet_op_mul_assign, pet_op_div_assign, + pet_op_and_assign, + pet_op_xor_assign, + pet_op_or_assign, pet_op_assign, pet_op_add, pet_op_sub, @@ -163,18 +169,20 @@ __isl_give pet_expr *pet_expr_from_index(__isl_take isl_multi_pw_aff *index); /* Does "expr" represent an affine expression? */ -int pet_expr_is_affine(__isl_keep pet_expr *expr); +isl_bool pet_expr_is_affine(__isl_keep pet_expr *expr); /* Does the access expression "expr" read the accessed elements? */ -int pet_expr_access_is_read(__isl_keep pet_expr *expr); +isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr); /* Does the access expression "expr" write to the accessed elements? */ -int pet_expr_access_is_write(__isl_keep pet_expr *expr); -/* Mark "expr" as a read dependening on "read". */ +isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr); +/* Does the access expression "expr" kill the accessed elements? */ +isl_bool pet_expr_access_is_kill(__isl_keep pet_expr *expr); +/* Mark "expr" as a read depending on "read". */ __isl_give pet_expr *pet_expr_access_set_read(__isl_take pet_expr *expr, int read); -/* Mark "expr" as a write dependening on "write". */ +/* Mark "expr" as a write depending on "write". */ __isl_give pet_expr *pet_expr_access_set_write(__isl_take pet_expr *expr, int write); -/* Mark "expr" as a kill dependening on "kill". */ +/* Mark "expr" as a kill depending on "kill". */ __isl_give pet_expr *pet_expr_access_set_kill(__isl_take pet_expr *expr, int kill); /* Return the reference identifier of access expression "expr". */ @@ -278,7 +286,8 @@ pet_tree_if_else, /* An if with an else branch */ pet_tree_for, pet_tree_infinite_loop, - pet_tree_while + pet_tree_while, + pet_tree_return, }; struct pet_tree; @@ -301,6 +310,9 @@ /* Return the expression of the expression tree "tree". */ __isl_give pet_expr *pet_tree_expr_get_expr(__isl_keep pet_tree *tree); +/* Return the expression returned by the return tree "tree". */ +__isl_give pet_expr *pet_tree_return_get_expr(__isl_keep pet_tree *tree); + /* Return the number of children of the block tree "tree". */ int pet_tree_block_n_child(__isl_keep pet_tree *tree); /* Return child "pos" of the block tree "tree". */ @@ -420,7 +432,7 @@ * this array has a valid (i.e., non-negative) size * * extent holds constraints on the indices - * + * * value_bounds holds constraints on the elements of the array * and may be NULL if no such constraints were specified by the user * @@ -436,6 +448,8 @@ * * declared is set if the array was declared somewhere inside the scop. * exposed is set if the declared array is visible outside the scop. + * outer is set if the type of the array elements is a record and + * the fields of this record are represented by separate pet_array structures. */ struct pet_array { isl_set *context; @@ -448,6 +462,7 @@ int uniquely_defined; int declared; int exposed; + int outer; }; /* This structure represents an implication on a boolean filter. 
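As an illustration of the pet.h interface changes above, a minimal usage sketch (the input file name "input.c" is a placeholder and error handling is reduced to the bare minimum). Note that the predicates changed above now return isl_bool, so a negative error value must be distinguished from isl_bool_false:

    #include <isl/ctx.h>
    #include <pet.h>

    int main(void)
    {
        /* New constructor: ties the pet options to the isl_ctx. */
        isl_ctx *ctx = isl_ctx_alloc_with_pet_options();
        struct pet_scop *scop;

        /* NULL: do not restrict extraction to a named function. */
        scop = pet_scop_extract_from_C_source(ctx, "input.c", NULL);
        if (scop) {
            scop = pet_scop_align_params(scop);
            pet_scop_dump(scop);
        }
        pet_scop_free(scop);
        isl_ctx_free(ctx);
        return 0;
    }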
@@ -517,6 +532,7 @@ int n_independence; struct pet_independence **independences; }; +typedef struct pet_scop pet_scop; /* Return a textual representation of the operator. */ const char *pet_op_str(enum pet_op_type op); @@ -526,7 +542,7 @@ * If function is not NULL, then the pet_scop is extracted from * a function with that name. */ -struct pet_scop *pet_scop_extract_from_C_source(isl_ctx *ctx, +__isl_give pet_scop *pet_scop_extract_from_C_source(isl_ctx *ctx, const char *filename, const char *function); /* Transform the C source file "input" by rewriting each scop @@ -535,63 +551,69 @@ */ int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output, __isl_give isl_printer *(*transform)(__isl_take isl_printer *p, - struct pet_scop *scop, void *user), void *user); + __isl_take pet_scop *scop, void *user), void *user); /* Given a scop and a printer passed to a pet_transform_C_source callback, * print the original corresponding code to the printer. */ -__isl_give isl_printer *pet_scop_print_original(struct pet_scop *scop, +__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop, __isl_take isl_printer *p); /* Update all isl_sets and isl_maps such that they all have the same * parameters in the same order. */ -struct pet_scop *pet_scop_align_params(struct pet_scop *scop); +__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop); /* Does "scop" contain any data dependent accesses? */ -int pet_scop_has_data_dependent_accesses(struct pet_scop *scop); +int pet_scop_has_data_dependent_accesses(__isl_keep pet_scop *scop); /* Does "scop" contain any data dependent conditions? */ -int pet_scop_has_data_dependent_conditions(struct pet_scop *scop); +int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop); /* pet_stmt_build_ast_exprs is currently limited to only handle * some forms of data dependent accesses. * If pet_scop_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs * can safely be called on all statements in the scop. */ -int pet_scop_can_build_ast_exprs(struct pet_scop *scop); +int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop); -void pet_scop_dump(struct pet_scop *scop); -struct pet_scop *pet_scop_free(struct pet_scop *scop); +void pet_scop_dump(__isl_keep pet_scop *scop); +__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop); -__isl_give isl_union_set *pet_scop_collect_domains(struct pet_scop *scop); -/* Collect all potential read access relations. */ -__isl_give isl_union_map *pet_scop_collect_may_reads(struct pet_scop *scop); -/* Collect all tagged potential read access relations. */ -__isl_give isl_union_map *pet_scop_collect_tagged_may_reads( - struct pet_scop *scop); -/* Collect all potential write access relations. */ -__isl_give isl_union_map *pet_scop_collect_may_writes(struct pet_scop *scop); -/* Collect all definite write access relations. */ -__isl_give isl_union_map *pet_scop_collect_must_writes(struct pet_scop *scop); -/* Collect all tagged potential write access relations. */ -__isl_give isl_union_map *pet_scop_collect_tagged_may_writes( - struct pet_scop *scop); -/* Collect all tagged definite write access relations. */ -__isl_give isl_union_map *pet_scop_collect_tagged_must_writes( - struct pet_scop *scop); -/* Collect all definite kill access relations. */ -__isl_give isl_union_map *pet_scop_collect_must_kills(struct pet_scop *scop); -/* Collect all tagged definite kill access relations. 
*/ -__isl_give isl_union_map *pet_scop_collect_tagged_must_kills( - struct pet_scop *scop); +/* Return the context of "scop". */ +__isl_give isl_set *pet_scop_get_context(__isl_keep pet_scop *scop); +/* Return the schedule of "scop". */ +__isl_give isl_schedule *pet_scop_get_schedule(__isl_keep pet_scop *scop); +/* Return the set of all statement instances. */ +__isl_give isl_union_set *pet_scop_get_instance_set(__isl_keep pet_scop *scop); +/* Return the potential read access relation. */ +__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop); +/* Return the tagged potential read access relation. */ +__isl_give isl_union_map *pet_scop_get_tagged_may_reads( + __isl_keep pet_scop *scop); +/* Return the potential write access relation. */ +__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop); +/* Return the definite write access relation. */ +__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop); +/* Return the tagged potential write access relation. */ +__isl_give isl_union_map *pet_scop_get_tagged_may_writes( + __isl_keep pet_scop *scop); +/* Return the tagged definite write access relation. */ +__isl_give isl_union_map *pet_scop_get_tagged_must_writes( + __isl_keep pet_scop *scop); +/* Return the definite kill access relation. */ +__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop); +/* Return the tagged definite kill access relation. */ +__isl_give isl_union_map *pet_scop_get_tagged_must_kills( + __isl_keep pet_scop *scop); /* Compute a mapping from all outermost arrays (of structs) in scop * to their innermost members. */ __isl_give isl_union_map *pet_scop_compute_outer_to_inner( - struct pet_scop *scop); + __isl_keep pet_scop *scop); /* Compute a mapping from all outermost arrays (of structs) in scop * to their members, including the outermost arrays themselves. 
 */
-__isl_give isl_union_map *pet_scop_compute_outer_to_any(struct pet_scop *scop);
+__isl_give isl_union_map *pet_scop_compute_outer_to_any(
+	__isl_keep pet_scop *scop);
 
 #if defined(__cplusplus)
 }
Index: polly/trunk/lib/External/ppcg/ChangeLog
===================================================================
--- polly/trunk/lib/External/ppcg/ChangeLog
+++ polly/trunk/lib/External/ppcg/ChangeLog
@@ -1,3 +1,25 @@
+version: 0.07
+date: Tue Feb 7 17:23:22 CET 2017
+changes:
+	- support hybrid tiling
+---
+version: 0.06
+date: Fri May 6 12:08:50 CEST 2016
+changes:
+	- use PPCG specific macro names in generated code
+	- complete transition to schedule trees
+	- maximize coincidence by default
+	- map arrays with constant index expressions to private memory
+	- optionally group chains of statements
+---
+version: 0.05
+date: Fri Jan 15 09:30:23 CET 2016
+changes:
+	- fix live-out computation
+	- optionally compute schedule for C target
+	- optionally perform tiling for C target
+	- create single kernel for non-permutable subtree
+---
 version: 0.04
 date: Wed Jun 17 10:52:58 CEST 2015
 changes:
Index: polly/trunk/lib/External/ppcg/GIT_HEAD_ID
===================================================================
--- polly/trunk/lib/External/ppcg/GIT_HEAD_ID
+++ polly/trunk/lib/External/ppcg/GIT_HEAD_ID
@@ -1 +1 @@
-ppcg-0.04
+ppcg-0.07
Index: polly/trunk/lib/External/ppcg/Makefile.am
===================================================================
--- polly/trunk/lib/External/ppcg/Makefile.am
+++ polly/trunk/lib/External/ppcg/Makefile.am
@@ -40,10 +40,15 @@
 	gpu_array_tile.h \
 	gpu_group.c \
 	gpu_group.h \
+	gpu_hybrid.c \
+	gpu_hybrid.h \
 	gpu_print.c \
 	gpu_print.h \
 	gpu_tree.c \
 	gpu_tree.h \
+	grouping.c \
+	hybrid.c \
+	hybrid.h \
 	schedule.c \
 	schedule.h \
 	ppcg_options.c \
@@ -52,6 +57,7 @@
 	ppcg.h \
 	print.c \
 	print.h \
+	util.c \
 	util.h \
 	version.c
Index: polly/trunk/lib/External/ppcg/README
===================================================================
--- polly/trunk/lib/External/ppcg/README
+++ polly/trunk/lib/External/ppcg/README
@@ -9,7 +9,7 @@
   (only needed if you want to compile the pet executable)
 - LLVM/clang libraries, 2.9 or higher (http://clang.llvm.org/get_started.html)
   Unless you have some other reasons for wanting to use the svn version,
-  it is best to install the latest release (3.6).
+  it is best to install the latest release (3.9).
   For more details, see pet/README.
 
 If you are installing on Ubuntu, then you can install the following packages:
@@ -30,8 +30,7 @@
 	git clone git://repo.or.cz/ppcg.git
 	cd ppcg
-	git submodule init
-	git submodule update
+	./get_submodules.sh
 
 	./autogen.sh
@@ -83,6 +82,11 @@
 The dimension of the "tile" space indicates the (maximal) number of loop
 dimensions to tile. The elements of the single integer tuple
 specify the tile sizes in each dimension.
+In case of hybrid tiling, the first element is half the size of
+the tile in the time (sequential) dimension. The second element
+specifies the number of elements in the base of the hexagon.
+The remaining elements specify the tile sizes in the remaining space
+dimensions.
 The dimension of the "grid" space indicates the (maximal) number of block
 dimensions in the grid. The elements of the single integer tuple
@@ -170,6 +174,17 @@
 PPCG from generating type definitions.
 
 
+GNU extensions
+
+By default, PPCG may print out macro definitions that involve
+GNU extensions such as __typeof__ and statement expressions.
+Some compilers may not support these extensions.
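For illustration, a macro in the style referred to here, combining __typeof__ with a statement expression (a hypothetical reconstruction, not PPCG's literal output):

    /* Both __typeof__ and the ({ ... }) statement expression
     * are GNU C extensions. */
    #define ppcg_min(x, y)  ({ __typeof__(x) _x = (x); \
                               __typeof__(y) _y = (y); \
                               _x < _y ? _x : _y; })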
+In particular, OpenCL 1.2 beignet 1.1.1 (git-6de6918) +has been reported not to support __typeof__. +The use of these extensions can be turned off with the +--no-allow-gnu-extensions option. + + Processing PolyBench When processing a PolyBench/C 3.2 benchmark, you should always specify @@ -200,6 +215,11 @@ For bug reports, feature requests and questions, contact http://groups.google.com/group/isl-development +Whenever you report a bug, please mention the exact version of PPCG +that you are using (output of "./ppcg --version"). If you are unable +to compile PPCG, then report the git version (output of "git describe") +or the version number included in the name of the tarball. + Citing PPCG Index: polly/trunk/lib/External/ppcg/configure.ac =================================================================== --- polly/trunk/lib/External/ppcg/configure.ac +++ polly/trunk/lib/External/ppcg/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ppcg], [0.04], [isl-development@googlegroups.com]) +AC_INIT([ppcg], [0.07], [isl-development@googlegroups.com]) AC_CONFIG_AUX_DIR([.]) AC_CONFIG_MACRO_DIR([m4]) AM_INIT_AUTOMAKE([foreign]) @@ -25,6 +25,7 @@ ISL_CFLAGS="$ISL_CFLAGS" ppcg_configure_args="$ppcg_configure_args --with-isl-builddir=../isl" ppcg_configure_args="$ppcg_configure_args --with-isl=build" + ppcg_configure_args="$ppcg_configure_args --with-clang=system" ;; build) ISL_BUILDDIR=`echo @abs_builddir@ | $with_isl_builddir/config.status --file=-` Index: polly/trunk/lib/External/ppcg/cpu.c =================================================================== --- polly/trunk/lib/External/ppcg/cpu.c +++ polly/trunk/lib/External/ppcg/cpu.c @@ -1,11 +1,14 @@ /* * Copyright 2012 INRIA Paris-Rocquencourt + * Copyright 2012 Ecole Normale Superieure * * Use of this software is governed by the MIT license * * Written by Tobias Grosser, INRIA Paris-Rocquencourt, * Domaine de Voluceau, Rocquenqourt, B.P. 105, * 78153 Le Chesnay Cedex France + * and Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France */ #include @@ -14,14 +17,19 @@ #include #include +#include #include #include +#include +#include #include #include "ppcg.h" #include "ppcg_options.h" #include "cpu.h" #include "print.h" +#include "schedule.h" +#include "util.h" /* Representation of a statement inside a generated AST. * @@ -39,7 +47,6 @@ static void ppcg_stmt_free(void *user) { struct ppcg_stmt *stmt = user; - int i; if (!stmt) return; @@ -118,7 +125,7 @@ static int ast_schedule_dim_is_parallel(__isl_keep isl_ast_build *build, struct ppcg_scop *scop) { - isl_union_map *schedule_node, *schedule, *deps; + isl_union_map *schedule, *deps; isl_map *schedule_deps, *test; isl_space *schedule_space; unsigned i, dimension, is_parallel; @@ -228,8 +235,10 @@ * that is marked as openmp parallel. * */ -static __isl_give isl_ast_node *ast_build_after_for(__isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build, void *user) { +static __isl_give isl_ast_node *ast_build_after_for( + __isl_take isl_ast_node *node, __isl_keep isl_ast_build *build, + void *user) +{ isl_id *id; struct ast_build_userinfo *build_info; struct ast_node_userinfo *info; @@ -327,7 +336,6 @@ __isl_take isl_ast_print_options *print_options, __isl_keep isl_ast_node *node, void *user) { - struct ppcg_print_info *print_info; isl_id *id; int openmp; @@ -416,29 +424,75 @@ return isl_ast_node_free(node); } -/* Set *depth to the number of scheduling dimensions - * for the schedule of the first domain. - * We assume here that this number is the same for all domains. 
+/* Set *depth (initialized to 0 by the caller) to the maximum + * of the schedule depths of the leaf nodes for which this function is called. */ -static isl_stat set_depth(__isl_take isl_map *map, void *user) +static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user) { - unsigned *depth = user; + int *depth = user; + int node_depth; + + if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf) + return isl_bool_true; + node_depth = isl_schedule_node_get_schedule_depth(node); + if (node_depth > *depth) + *depth = node_depth; + + return isl_bool_false; +} + +/* This function is called for each node in a CPU AST. + * In case of a user node, print the macro definitions required + * for printing the AST expressions in the annotation, if any. + * For other nodes, return true such that descendants are also + * visited. + * + * In particular, print the macro definitions needed for the substitutions + * of the original user statements. + */ +static isl_bool at_node(__isl_keep isl_ast_node *node, void *user) +{ + struct ppcg_stmt *stmt; + isl_id *id; + isl_printer **p = user; - *depth = isl_map_dim(map, isl_dim_out); + if (isl_ast_node_get_type(node) != isl_ast_node_user) + return isl_bool_true; - isl_map_free(map); - return isl_stat_error; + id = isl_ast_node_get_annotation(node); + stmt = isl_id_get_user(id); + isl_id_free(id); + + if (!stmt) + return isl_bool_error; + + *p = ppcg_print_body_macros(*p, stmt->ref2expr); + if (!*p) + return isl_bool_error; + + return isl_bool_false; +} + +/* Print the required macros for the CPU AST "node" to "p", + * including those needed for the user statements inside the AST. + */ +static __isl_give isl_printer *cpu_print_macros(__isl_take isl_printer *p, + __isl_keep isl_ast_node *node) +{ + if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0) + return isl_printer_free(p); + p = ppcg_print_macros(p, node); + return p; } -/* Code generate the scop 'scop' and print the corresponding C code to 'p'. +/* Code generate the scop 'scop' using "schedule" + * and print the corresponding C code to 'p'. 
*/ static __isl_give isl_printer *print_scop(struct ppcg_scop *scop, - __isl_take isl_printer *p, struct ppcg_options *options) + __isl_take isl_schedule *schedule, __isl_take isl_printer *p, + struct ppcg_options *options) { isl_ctx *ctx = isl_printer_get_ctx(p); - isl_set *context; - isl_union_set *domain_set; - isl_union_map *schedule_map; isl_ast_build *build; isl_ast_print_options *print_options; isl_ast_node *tree; @@ -446,14 +500,12 @@ struct ast_build_userinfo build_info; int depth; - context = isl_set_copy(scop->context); - domain_set = isl_union_set_copy(scop->domain); - schedule_map = isl_schedule_get_map(scop->schedule); - schedule_map = isl_union_map_intersect_domain(schedule_map, domain_set); - - isl_union_map_foreach_map(schedule_map, &set_depth, &depth); + depth = 0; + if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth, + &depth) < 0) + goto error; - build = isl_ast_build_from_context(context); + build = isl_ast_build_alloc(ctx); iterators = ppcg_scop_generate_names(scop, depth, "c"); build = isl_ast_build_set_iterators(build, iterators); build = isl_ast_build_set_at_each_domain(build, &at_each_domain, scop); @@ -470,7 +522,7 @@ &build_info); } - tree = isl_ast_build_node_from_schedule_map(build, schedule_map); + tree = isl_ast_build_node_from_schedule(build, schedule); isl_ast_build_free(build); print_options = isl_ast_print_options_alloc(ctx); @@ -480,21 +532,188 @@ print_options = isl_ast_print_options_set_print_for(print_options, &print_for, NULL); - p = ppcg_print_macros(p, tree); + p = cpu_print_macros(p, tree); p = isl_ast_node_print(tree, p, print_options); isl_ast_node_free(tree); return p; +error: + isl_schedule_free(schedule); + isl_printer_free(p); + return NULL; } -/* Generate CPU code for the scop "ps" and print the corresponding C code - * to "p", including variable declarations. +/* Tile the band node "node" with tile sizes "sizes" and + * mark all members of the resulting tile node as "atomic". */ -__isl_give isl_printer *print_cpu(__isl_take isl_printer *p, - struct ppcg_scop *ps, struct ppcg_options *options) +static __isl_give isl_schedule_node *tile(__isl_take isl_schedule_node *node, + __isl_take isl_multi_val *sizes) +{ + node = isl_schedule_node_band_tile(node, sizes); + node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); + + return node; +} + +/* Tile "node", if it is a band node with at least 2 members. + * The tile sizes are set from the "tile_size" option. + */ +static __isl_give isl_schedule_node *tile_band( + __isl_take isl_schedule_node *node, void *user) +{ + struct ppcg_scop *scop = user; + int n; + isl_space *space; + isl_multi_val *sizes; + + if (isl_schedule_node_get_type(node) != isl_schedule_node_band) + return node; + + n = isl_schedule_node_band_n_member(node); + if (n <= 1) + return node; + + space = isl_schedule_node_band_get_space(node); + sizes = ppcg_multi_val_from_int(space, scop->options->tile_size); + + return tile(node, sizes); +} + +/* Construct schedule constraints from the dependences in ps + * for the purpose of computing a schedule for a CPU. + * + * The proximity constraints are set to the flow dependences. + * + * If live-range reordering is allowed then the conditional validity + * constraints are set to the order dependences with the flow dependences + * as condition. That is, a live-range (flow dependence) will be either + * local to an iteration of a band or all adjacent order dependences + * will be respected by the band. 
+ * The validity constraints are set to the union of the flow dependences
+ * and the forced dependences, while the coincidence constraints
+ * are set to the union of the flow dependences, the forced dependences and
+ * the order dependences.
+ *
+ * If live-range reordering is not allowed, then both the validity
+ * and the coincidence constraints are set to the union of the flow
+ * dependences and the false dependences.
+ *
+ * Note that the coincidence constraints are only set when the "openmp"
+ * option is set. Even though the way openmp pragmas are introduced
+ * does not rely on the coincident property of the schedule band members,
+ * the coincidence constraints do affect the way the schedule is constructed,
+ * such that more schedule dimensions should be detected as parallel
+ * by ast_schedule_dim_is_parallel.
+ * Since the order dependences are also taken into account by
+ * ast_schedule_dim_is_parallel, they are also added to
+ * the coincidence constraints. If the openmp handling learns
+ * how to privatize some memory, then the corresponding order
+ * dependences can be removed from the coincidence constraints.
+ */
+static __isl_give isl_schedule_constraints *construct_cpu_schedule_constraints(
+	struct ppcg_scop *ps)
+{
+	isl_schedule_constraints *sc;
+	isl_union_map *validity, *coincidence;
+
+	sc = isl_schedule_constraints_on_domain(isl_union_set_copy(ps->domain));
+	if (ps->options->live_range_reordering) {
+		sc = isl_schedule_constraints_set_conditional_validity(sc,
+			isl_union_map_copy(ps->tagged_dep_flow),
+			isl_union_map_copy(ps->tagged_dep_order));
+		validity = isl_union_map_copy(ps->dep_flow);
+		validity = isl_union_map_union(validity,
+			isl_union_map_copy(ps->dep_forced));
+		if (ps->options->openmp) {
+			coincidence = isl_union_map_copy(validity);
+			coincidence = isl_union_map_union(coincidence,
+				isl_union_map_copy(ps->dep_order));
+		}
+	} else {
+		validity = isl_union_map_copy(ps->dep_flow);
+		validity = isl_union_map_union(validity,
+			isl_union_map_copy(ps->dep_false));
+		if (ps->options->openmp)
+			coincidence = isl_union_map_copy(validity);
+	}
+	if (ps->options->openmp)
+		sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
+	sc = isl_schedule_constraints_set_validity(sc, validity);
+	sc = isl_schedule_constraints_set_proximity(sc,
+		isl_union_map_copy(ps->dep_flow));
+
+	return sc;
+}
+
+/* Compute a schedule for the scop "ps".
+ *
+ * First derive the appropriate schedule constraints from the dependences
+ * in "ps" and then compute a schedule from those schedule constraints,
+ * possibly grouping statement instances based on the input schedule.
+ */
+static __isl_give isl_schedule *compute_cpu_schedule(struct ppcg_scop *ps)
+{
+	isl_schedule_constraints *sc;
+	isl_schedule *schedule;
+
+	if (!ps)
+		return NULL;
+
+	sc = construct_cpu_schedule_constraints(ps);
+
+	if (ps->options->debug->dump_schedule_constraints)
+		isl_schedule_constraints_dump(sc);
+	schedule = ppcg_compute_schedule(sc, ps->schedule, ps->options);
+
+	return schedule;
+}
+
+/* Compute a new schedule for the scop "ps" if the reschedule option is set.
+ * Otherwise, return a copy of the original schedule.
+ */
+static __isl_give isl_schedule *optionally_compute_schedule(void *user)
+{
+	struct ppcg_scop *ps = user;
+
+	if (!ps)
+		return NULL;
+	if (!ps->options->reschedule)
+		return isl_schedule_copy(ps->schedule);
+	return compute_cpu_schedule(ps);
+}
+
+/* Compute a schedule based on the dependences in "ps" and
+ * tile it if requested by the user.
+ */ +static __isl_give isl_schedule *get_schedule(struct ppcg_scop *ps, + struct ppcg_options *options) +{ + isl_ctx *ctx; + isl_schedule *schedule; + + if (!ps) + return NULL; + + ctx = isl_union_set_get_ctx(ps->domain); + schedule = ppcg_get_schedule(ctx, options, + &optionally_compute_schedule, ps); + if (ps->options->tile) + schedule = isl_schedule_map_schedule_node_bottom_up(schedule, + &tile_band, ps); + + return schedule; +} + +/* Generate CPU code for the scop "ps" using "schedule" and + * print the corresponding C code to "p", including variable declarations. + */ +static __isl_give isl_printer *print_cpu_with_schedule( + __isl_take isl_printer *p, struct ppcg_scop *ps, + __isl_take isl_schedule *schedule, struct ppcg_options *options) { int hidden; + isl_set *context; p = isl_printer_start_line(p); p = isl_printer_print_str(p, "/* ppcg generated CPU code */"); @@ -503,30 +722,61 @@ p = isl_printer_start_line(p); p = isl_printer_end_line(p); - p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p); + p = ppcg_set_macro_names(p); p = ppcg_print_exposed_declarations(p, ps); hidden = ppcg_scop_any_hidden_declarations(ps); if (hidden) { p = ppcg_start_block(p); p = ppcg_print_hidden_declarations(p, ps); } + + context = isl_set_copy(ps->context); + context = isl_set_from_params(context); + schedule = isl_schedule_insert_context(schedule, context); if (options->debug->dump_final_schedule) - isl_schedule_dump(ps->schedule); - p = print_scop(ps, p, options); + isl_schedule_dump(schedule); + p = print_scop(ps, schedule, p, options); if (hidden) p = ppcg_end_block(p); return p; } -/* Wrapper around print_cpu for use as a ppcg_transform callback. +/* Generate CPU code for the scop "ps" and print the corresponding C code + * to "p", including variable declarations. + */ +__isl_give isl_printer *print_cpu(__isl_take isl_printer *p, + struct ppcg_scop *ps, struct ppcg_options *options) +{ + isl_schedule *schedule; + + schedule = isl_schedule_copy(ps->schedule); + return print_cpu_with_schedule(p, ps, schedule, options); +} + +/* Generate CPU code for "scop" and print it to "p". + * + * First obtain a schedule for "scop" and then print code for "scop" + * using that schedule. + */ +static __isl_give isl_printer *generate(__isl_take isl_printer *p, + struct ppcg_scop *scop, struct ppcg_options *options) +{ + isl_schedule *schedule; + + schedule = get_schedule(scop, options); + + return print_cpu_with_schedule(p, scop, schedule, options); +} + +/* Wrapper around generate for use as a ppcg_transform callback. 
*/ static __isl_give isl_printer *print_cpu_wrap(__isl_take isl_printer *p, struct ppcg_scop *scop, void *user) { struct ppcg_options *options = user; - return print_cpu(p, scop, options); + return generate(p, scop, options); } /* Transform the code in the file called "input" by replacing Index: polly/trunk/lib/External/ppcg/cuda.h =================================================================== --- polly/trunk/lib/External/ppcg/cuda.h +++ polly/trunk/lib/External/ppcg/cuda.h @@ -6,8 +6,5 @@ int generate_cuda(isl_ctx *ctx, struct ppcg_options *options, const char *input); -__isl_give isl_printer *print_host_user(__isl_take isl_printer *p, - __isl_take isl_ast_print_options *print_options, - __isl_keep isl_ast_node *node, void *user); #endif Index: polly/trunk/lib/External/ppcg/cuda.c =================================================================== --- polly/trunk/lib/External/ppcg/cuda.c +++ polly/trunk/lib/External/ppcg/cuda.c @@ -56,9 +56,13 @@ if (!array->linearize && array->n_index > 1) { p = isl_printer_print_str(p, ")"); for (i = 1; i < array->n_index; i++) { + isl_ast_expr *bound; + bound = isl_ast_expr_get_op_arg(array->bound_expr, + 1 + i); p = isl_printer_print_str(p, "["); - p = isl_printer_print_pw_aff(p, array->bound[i]); + p = isl_printer_print_ast_expr(p, bound); p = isl_printer_print_str(p, "]"); + isl_ast_expr_free(bound); } } p = isl_printer_print_str(p, ";"); @@ -89,8 +93,11 @@ int i; for (i = 0; i < prog->n_array; ++i) { + struct gpu_array_info *array = &prog->array[i]; + if (!gpu_array_requires_device_allocation(&prog->array[i])) continue; + p = ppcg_ast_expr_print_macros(array->bound_expr, p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "cudaCheckReturn(cudaMalloc((void **) &dev_"); @@ -105,6 +112,24 @@ return p; } +static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p, + struct gpu_prog *prog) +{ + int i; + + for (i = 0; i < prog->n_array; ++i) { + if (!gpu_array_requires_device_allocation(&prog->array[i])) + continue; + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_"); + p = isl_printer_print_str(p, prog->array[i].name); + p = isl_printer_print_str(p, "));"); + p = isl_printer_end_line(p); + } + + return p; +} + /* Print code to "p" for copying "array" from the host to the device * in its entirety. 
The bounds on the extent of "array" have * been precomputed in extract_array_info and are used in @@ -153,20 +178,20 @@ return p; } -static isl_printer *print_reverse_list(isl_printer *p, int len, int *list) +static void print_reverse_list(FILE *out, int len, int *list) { int i; - if (len == 0) - return p; + if (!out || len == 0) + return; - p = isl_printer_print_str(p, "("); + fprintf(out, "("); for (i = 0; i < len; ++i) { if (i) - p = isl_printer_print_str(p, ", "); - p = isl_printer_print_int(p, list[len - 1 - i]); + fprintf(out, ", "); + fprintf(out, "%d", list[len - 1 - i]); } - return isl_printer_print_str(p, ")"); + fprintf(out, ")"); } /* Print the effective grid size as a list of the sizes in each @@ -184,11 +209,11 @@ p = isl_printer_print_str(p, "("); for (i = dim - 1; i >= 0; --i) { - isl_pw_aff *bound; + isl_ast_expr *bound; - bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i); - p = isl_printer_print_pw_aff(p, bound); - isl_pw_aff_free(bound); + bound = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + i); + p = isl_printer_print_ast_expr(p, bound); + isl_ast_expr_free(bound); if (i > 0) p = isl_printer_print_str(p, ", "); @@ -469,8 +494,8 @@ p = print_kernel_vars(p, kernel); p = isl_printer_end_line(p); - p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p); - p = ppcg_print_macros(p, kernel->tree); + p = ppcg_set_macro_names(p); + p = gpu_print_macros(p, kernel->tree); print_options = isl_ast_print_options_alloc(ctx); print_options = isl_ast_print_options_set_print_user(print_options, @@ -481,15 +506,46 @@ fprintf(cuda->kernel_c, "}\n"); } -/* Print a statement for copying an array to or from the device. - * The statement identifier is called "to_device_" or - * "from_device_" and its user pointer points - * to the gpu_array_info of the array that needs to be copied. +/* Print code for initializing the device for execution of the transformed + * code. This includes declaring locally defined variables as well as + * declaring and allocating the required copies of arrays on the device. + */ +static __isl_give isl_printer *init_device(__isl_take isl_printer *p, + struct gpu_prog *prog) +{ + p = print_cuda_macros(p); + + p = gpu_print_local_declarations(p, prog); + p = declare_device_arrays(p, prog); + p = allocate_device_arrays(p, prog); + + return p; +} + +/* Print code for clearing the device after execution of the transformed code. + * In particular, free the memory that was allocated on the device. + */ +static __isl_give isl_printer *clear_device(__isl_take isl_printer *p, + struct gpu_prog *prog) +{ + p = free_device_arrays(p, prog); + + return p; +} + +/* Print a statement for copying an array to or from the device, + * or for initializing or clearing the device. + * The statement identifier of a copying node is called + * "to_device_" or "from_device_" and + * its user pointer points to the gpu_array_info of the array + * that needs to be copied. + * The node for initializing the device is called "init_device". + * The node for clearing the device is called "clear_device". * - * Extract the array from the identifier and call - * copy_array_to_device or copy_array_from_device. + * Extract the array (if any) from the identifier and call + * init_device, clear_device, copy_array_to_device or copy_array_from_device. 
-static __isl_give isl_printer *print_to_from_device(__isl_take isl_printer *p,
+static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
 	__isl_keep isl_ast_node *node, struct gpu_prog *prog)
 {
 	isl_ast_expr *expr, *arg;
@@ -507,7 +563,11 @@
 	isl_ast_expr_free(expr);
 
 	if (!name)
-		array = NULL;
+		return isl_printer_free(p);
+	if (!strcmp(name, "init_device"))
+		return init_device(p, prog);
+	if (!strcmp(name, "clear_device"))
+		return clear_device(p, prog);
 	if (!array)
 		return isl_printer_free(p);
 
@@ -524,17 +584,18 @@
 
 /* Print the user statement of the host code to "p".
  *
- * The host code may contain original user statements, kernel launches and
- * statements that copy data to/from the device.
+ * The host code may contain original user statements, kernel launches,
+ * statements that copy data to/from the device and statements
+ * that initialize or clear the device.
  * The original user statements and the kernel launches have
- * an associated annotation, while the data copy statements do not.
- * The latter are handled by print_to_from_device.
+ * an associated annotation, while the other statements do not.
+ * The latter are handled by print_device_node.
  * The annotation on the user statements is called "user".
  *
  * In case of a kernel launch, print a block of statements that
  * defines the grid and the block and then launches the kernel.
  */
-__isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
+static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 	__isl_take isl_ast_print_options *print_options,
 	__isl_keep isl_ast_node *node, void *user)
 {
@@ -550,7 +611,7 @@
 	id = isl_ast_node_get_annotation(node);
 
 	if (!id)
-		return print_to_from_device(p, node, data->prog);
+		return print_device_node(p, node, data->prog);
 
 	is_user = !strcmp(isl_id_get_name(id), "user");
 	kernel = is_user ?
NULL : isl_id_get_user(id); @@ -560,16 +621,14 @@ if (is_user) return ppcg_kernel_print_domain(p, stmt); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "{"); - p = isl_printer_end_line(p); - p = isl_printer_indent(p, 2); + p = ppcg_start_block(p); p = isl_printer_start_line(p); p = isl_printer_print_str(p, "dim3 k"); p = isl_printer_print_int(p, kernel->id); p = isl_printer_print_str(p, "_dimBlock"); - p = print_reverse_list(p, kernel->n_block, kernel->block_dim); + print_reverse_list(isl_printer_get_file(p), + kernel->n_block, kernel->block_dim); p = isl_printer_print_str(p, ";"); p = isl_printer_end_line(p); @@ -591,17 +650,12 @@ p = isl_printer_print_str(p, "cudaCheckKernel();"); p = isl_printer_end_line(p); - p = isl_printer_indent(p, -2); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "}"); - p = isl_printer_end_line(p); + p = ppcg_end_block(p); p = isl_printer_start_line(p); p = isl_printer_end_line(p); -#if 0 print_kernel(data->prog, kernel, data->cuda); -#endif return p; } @@ -618,30 +672,12 @@ print_options = isl_ast_print_options_set_print_user(print_options, &print_host_user, &data); - p = ppcg_print_macros(p, tree); + p = gpu_print_macros(p, tree); p = isl_ast_node_print(tree, p, print_options); return p; } -static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p, - struct gpu_prog *prog) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - if (!gpu_array_requires_device_allocation(&prog->array[i])) - continue; - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_"); - p = isl_printer_print_str(p, prog->array[i].name); - p = isl_printer_print_str(p, "));"); - p = isl_printer_end_line(p); - } - - return p; -} - /* Given a gpu_prog "prog" and the corresponding transformed AST * "tree", print the entire CUDA code to "p". * "types" collects the types for which a definition has already @@ -662,20 +698,8 @@ if (!kernel) return isl_printer_free(p); - p = ppcg_start_block(p); - - p = print_cuda_macros(p); - - p = gpu_print_local_declarations(p, prog); - p = declare_device_arrays(p, prog); - p = allocate_device_arrays(p, prog); - p = print_host_code(p, prog, tree, cuda); - p = free_device_arrays(p, prog); - - p = ppcg_end_block(p); - return p; } Index: polly/trunk/lib/External/ppcg/gpu.h =================================================================== --- polly/trunk/lib/External/ppcg/gpu.h +++ polly/trunk/lib/External/ppcg/gpu.h @@ -2,11 +2,59 @@ #define _GPU_H #include +#include #include +#include + #include "ppcg.h" #include "ppcg_options.h" +/* An access to an outer array element or an iterator. + * Accesses to iterators have an access relation that maps to an unnamed space. + * An access may be both read and write. + * If the access relation is empty, then the output dimension may + * not be equal to the dimension of the corresponding array. + */ +struct gpu_stmt_access { + /* Access reads elements */ + int read; + /* Access writes elements */ + int write; + /* All writes are definite writes. */ + int exact_write; + /* Is a single, fixed element being accessed? */ + isl_bool fixed_element; + /* The number of index expressions specified in the access. */ + int n_index; + + /* May access relation */ + isl_map *access; + /* May access relation with as domain a mapping from iteration domain + * to a reference identifier. + */ + isl_map *tagged_access; + /* The reference id of the corresponding pet_expr. 
*/ + isl_id *ref_id; + + struct gpu_stmt_access *next; +}; + +/* A representation of a user statement. + * "stmt" points to the corresponding pet statement. + * "id" is the identifier of the instance set of the statement. + * "accesses" is a linked list of accesses performed by the statement. + * If the statement has been killed, i.e., if it will not be scheduled, + * then this linked list may be empty even if the actual statement does + * perform accesses. + */ +struct gpu_stmt { + isl_id *id; + struct pet_stmt *stmt; + + struct gpu_stmt_access *accesses; +}; + /* Represents an outer array possibly accessed by a gpu_prog. */ struct gpu_array_info { @@ -18,12 +66,20 @@ int size; /* Name of the array. */ char *name; + /* Declared extent of original array. */ + isl_set *declared_extent; + /* AST expression for declared size of original array. */ + isl_ast_expr *declared_size; /* Extent of the array that needs to be copied. */ isl_set *extent; /* Number of indices. */ unsigned n_index; /* For each index, a bound on "extent" in that direction. */ - isl_pw_aff **bound; + isl_multi_pw_aff *bound; + /* The corresponding access AST expression, if the array needs + * to be allocated on the device. + */ + isl_ast_expr *bound_expr; /* All references to this array; point to elements of a linked list. */ int n_ref; @@ -38,6 +94,9 @@ /* Are the elements of the array structures? */ int has_compound_element; + /* Are the elements only accessed through constant index expressions? */ + int only_fixed_element; + /* Is the array local to the scop? */ int local; /* Is the array local and should it be declared on the host? */ @@ -54,8 +113,6 @@ * It is set to NULL otherwise. */ isl_union_map *dep_order; - - void *user; }; /* Represents an outer array accessed by a ppcg_kernel, localized @@ -67,8 +124,8 @@ * must be mapped to a register. * "global" is set if the global device memory corresponding * to this array is accessed by the kernel. - * For each index i with 0 <= i < n_index, - * bound[i] is equal to array->bound[i] specialized to the current kernel. + * "bound" is equal to array->bound specialized to the current kernel. + * "bound_expr" is the corresponding access AST expression. */ struct gpu_local_array_info { struct gpu_array_info *array; @@ -80,7 +137,8 @@ int global; unsigned n_index; - isl_pw_aff_list *bound; + isl_multi_pw_aff *bound; + isl_ast_expr *bound_expr; }; __isl_give isl_ast_expr *gpu_local_array_info_linearize_index( @@ -125,7 +183,7 @@ /* A mapping from the outer arrays to all corresponding inner arrays. */ isl_union_map *to_inner; /* A mapping from all intermediate arrays to their outer arrays, - * including an identity mapping from the anoymous 1D space to itself. + * including an identity mapping from the anonymous 1D space to itself. */ isl_union_map *any_to_outer; @@ -150,17 +208,6 @@ struct gpu_types *types, void *user); void *print_user; - isl_id_to_ast_expr *(*build_ast_expr)(void *stmt, - isl_ast_build *build, - isl_multi_pw_aff *(*fn_index)( - __isl_take isl_multi_pw_aff *mpa, isl_id *id, - void *user), - void *user_index, - isl_ast_expr *(*fn_expr)(isl_ast_expr *expr, - isl_id *id, void *user), - void *user_expr); - - struct gpu_prog *prog; /* The generated AST. 
*/ isl_ast_node *tree; @@ -178,7 +225,7 @@ int kernel_id; }; -enum ppcg_kernel_access_type { +enum ppcg_group_access_type { ppcg_access_global, ppcg_access_shared, ppcg_access_private @@ -238,7 +285,7 @@ */ struct ppcg_kernel_var { struct gpu_array_info *array; - enum ppcg_kernel_access_type type; + enum ppcg_group_access_type type; char *name; isl_vec *size; }; @@ -262,6 +309,8 @@ * refers to the x dimension. * * grid_size reflects the effective grid size. + * grid_size_expr contains a corresponding access AST expression, built within + * the context where the launch appears. * * context contains the values of the parameters and outer schedule dimensions * for which any statement instance in this kernel needs to be executed. @@ -272,7 +321,14 @@ * core contains the spaces of the statement domains that form * the core computation of the kernel. It is used to navigate * the tree during the construction of the device part of the schedule - * tree in create_kernel. + * tree in gpu_create_kernel. + * + * expanded_domain contains the original statement instances, + * i.e., those that appear in the domains of access relations, + * that are involved in the kernel. + * contraction maps those original statement instances to + * the statement instances that are active at the point + * in the schedule tree where the kernel is created. * * arrays is the set of possibly accessed outer array elements. * @@ -297,10 +353,12 @@ * are represented by "n_block" parameters with as names the elements * of "thread_ids". * - * shared_schedule corresponds to the schedule dimensions of + * copy_schedule corresponds to the schedule dimensions of * the (tiled) schedule for this kernel that have been taken into account * for computing private/shared memory tiles. - * shared_schedule_dim is the dimension of this schedule. + * The domain corresponds to the original statement instances, i.e., + * those that appear in the leaves of the schedule tree. + * copy_schedule_dim is the dimension of this schedule. * * sync_writes contains write references that require synchronization. 
* Each reference is represented by a universe set in a space [S[i,j] -> R[]] @@ -323,12 +381,16 @@ int block_dim[3]; isl_multi_pw_aff *grid_size; + isl_ast_expr *grid_size_expr; isl_set *context; int n_sync; isl_union_set *core; isl_union_set *arrays; + isl_union_pw_multi_aff *contraction; + isl_union_set *expanded_domain; + isl_space *space; int n_array; @@ -341,8 +403,8 @@ isl_union_set *block_filter; isl_union_set *thread_filter; - isl_union_pw_multi_aff *shared_schedule; - int shared_schedule_dim; + isl_union_pw_multi_aff *copy_schedule; + int copy_schedule_dim; isl_union_set *sync_writes; @@ -353,6 +415,7 @@ int gpu_array_is_read_only_scalar(struct gpu_array_info *array); int gpu_array_requires_device_allocation(struct gpu_array_info *array); __isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array); +isl_bool gpu_array_can_be_private(struct gpu_array_info *array); struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop); void *gpu_prog_free(struct gpu_prog *prog); @@ -365,13 +428,8 @@ struct gpu_prog *prog, __isl_keep isl_ast_node *tree, struct gpu_types *types, void *user), void *user); -__isl_give isl_schedule *get_schedule(struct gpu_gen *gen); -int has_any_permutable_node(__isl_keep isl_schedule *schedule); -__isl_give isl_schedule *map_to_device(struct gpu_gen *gen, - __isl_take isl_schedule *schedule); -__isl_give isl_ast_node *generate_code(struct gpu_gen *gen, - __isl_take isl_schedule *schedule); +__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen, + __isl_take isl_schedule_node *node, int scale, + __isl_keep isl_multi_val *sizes); -__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog); -void collect_references(struct gpu_prog *prog, struct gpu_array_info *array); #endif Index: polly/trunk/lib/External/ppcg/gpu.c =================================================================== --- polly/trunk/lib/External/ppcg/gpu.c +++ polly/trunk/lib/External/ppcg/gpu.c @@ -1,6 +1,7 @@ /* * Copyright 2010-2011 INRIA Saclay * Copyright 2012-2013 Ecole Normale Superieure + * Copyright 2015-2016 Sven Verdoolaege * * Use of this software is governed by the MIT license * @@ -28,7 +29,9 @@ #include "gpu.h" #include "gpu_array_tile.h" #include "gpu_group.h" +#include "gpu_hybrid.h" #include "gpu_tree.h" +#include "hybrid.h" #include "schedule.h" #include "ppcg_options.h" #include "print.h" @@ -55,7 +58,7 @@ /* Collect all references to the given array and store pointers to them * in array->refs. */ -void collect_references(struct gpu_prog *prog, +static void collect_references(struct gpu_prog *prog, struct gpu_array_info *array) { int i; @@ -156,6 +159,20 @@ return empty; } +/* Is "array" only accessed as individual, fixed elements? + * That is, does each access to "array" access a single, fixed element? + */ +static isl_bool only_fixed_element_accessed(struct gpu_array_info *array) +{ + int i; + + for (i = 0; i < array->n_ref; ++i) + if (!array->refs[i]->fixed_element) + return isl_bool_false; + + return isl_bool_true; +} + /* Compute bounds on the host array "pa" based on the corresponding * accessed elements in "arrays" * and collect all references to the array. 
@@ -169,22 +186,18 @@ struct gpu_array_info *info, struct pet_array *pa, __isl_keep isl_union_set *arrays) { - int i, empty; + int empty; const char *name; int n_index; - isl_pw_aff **bounds; + isl_multi_pw_aff *bounds; isl_set *accessed, *extent; n_index = isl_set_dim(pa->extent, isl_dim_set); name = isl_set_get_tuple_name(pa->extent); - bounds = isl_alloc_array(prog->ctx, isl_pw_aff *, n_index); - if (!bounds) - return -1; info->space = isl_set_get_space(pa->extent); info->name = strdup(name); info->n_index = n_index; - info->bound = bounds; info->linearize = prog->scop->options->linearize_device_arrays; info->type = strdup(pa->element_type); @@ -193,6 +206,7 @@ info->has_compound_element = pa->element_is_record; info->read_only_scalar = is_read_only_scalar(info, prog); + info->declared_extent = isl_set_copy(pa->extent); accessed = isl_union_set_extract_set(arrays, isl_space_copy(info->space)); empty = isl_set_is_empty(accessed); @@ -202,35 +216,16 @@ if (empty < 0) return -1; info->accessed = !empty; - for (i = 0; i < n_index; ++i) { - isl_set *dom; - isl_local_space *ls; - isl_aff *one; - isl_pw_aff *bound; - - dom = isl_set_copy(extent); - dom = isl_set_project_out(dom, isl_dim_set, i + 1, - n_index - (i + 1)); - dom = isl_set_project_out(dom, isl_dim_set, 0, i); - if (!isl_set_dim_has_upper_bound(dom, isl_dim_set, 0)) { - fprintf(stderr, "unable to determine extent of '%s' " - "in dimension %d\n", info->name, i); - dom = isl_set_free(dom); - } - bound = isl_set_dim_max(dom, 0); - dom = isl_pw_aff_domain(isl_pw_aff_copy(bound)); - ls = isl_local_space_from_space(isl_set_get_space(dom)); - one = isl_aff_zero_on_domain(ls); - one = isl_aff_add_constant_si(one, 1); - bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one)); - bound = isl_pw_aff_gist(bound, isl_set_copy(prog->context)); - - bounds[i] = bound; - if (!isl_pw_aff_is_cst(bound)) - info->linearize = 1; - } + bounds = ppcg_size_from_extent(isl_set_copy(extent)); + bounds = isl_multi_pw_aff_gist(bounds, isl_set_copy(prog->context)); + if (!bounds) + return -1; + if (!isl_multi_pw_aff_is_cst(bounds)) + info->linearize = 1; + info->bound = bounds; collect_references(prog, info); + info->only_fixed_element = only_fixed_element_accessed(info); return 0; } @@ -238,7 +233,7 @@ /* Remove independence from the order constraints "order" on array "array". * Since the pairs of iterations in the filter relation of an independence * are guaranteed to be completely independent by the user, there is - * no need to ensure that live ranges are ordered along thong pairs. + * no need to ensure that live ranges are ordered along those pairs. * We make an exception for local variables, though, as the independence * guarantee does not apply to those. * @@ -277,7 +272,7 @@ * the same array, the target of these order dependences will also * be one of these references.) * Additionally, store the union of these array->dep_order relations - * for all non-scalar arrays in prog->array_order. + * for all arrays that cannot be mapped to private memory in prog->array_order. */ void collect_order_dependences(struct gpu_prog *prog) { @@ -313,7 +308,7 @@ order = remove_independences(prog, array, order); array->dep_order = order; - if (gpu_array_is_scalar(array) && !array->has_compound_element) + if (gpu_array_can_be_private(array)) continue; prog->array_order = isl_union_map_union(prog->array_order, @@ -330,6 +325,7 @@ * elements by "prog". * If there are any member accesses involved, then they are first mapped * to the outer arrays of structs. 
+ * Only extract gpu_array_info entries for these outer arrays. * * If we are allowing live range reordering, then also set * the dep_order field. Otherwise leave it NULL. @@ -353,10 +349,21 @@ prog->array = isl_calloc_array(prog->ctx, struct gpu_array_info, prog->n_array); assert(prog->array); - for (i = 0; i < prog->scop->pet->n_array; ++i) - if (extract_array_info(prog, &prog->array[i], + prog->n_array = 0; + for (i = 0; i < prog->scop->pet->n_array; ++i) { + isl_bool field; + + field = isl_set_is_wrapping(prog->scop->pet->arrays[i]->extent); + if (field < 0) + break; + if (field) + continue; + if (extract_array_info(prog, &prog->array[prog->n_array++], prog->scop->pet->arrays[i], arrays) < 0) r = -1; + } + if (i < prog->scop->pet->n_array) + r = -1; isl_union_set_free(arrays); @@ -368,17 +375,17 @@ static void free_array_info(struct gpu_prog *prog) { - int i, j; + int i; for (i = 0; i < prog->n_array; ++i) { - int n_index = prog->array[i].n_index; free(prog->array[i].type); free(prog->array[i].name); - for (j = 0; j < n_index; ++j) - isl_pw_aff_free(prog->array[i].bound[j]); + isl_multi_pw_aff_free(prog->array[i].bound); + isl_ast_expr_free(prog->array[i].bound_expr); isl_space_free(prog->array[i].space); + isl_set_free(prog->array[i].declared_extent); isl_set_free(prog->array[i].extent); - free(prog->array[i].bound); + isl_ast_expr_free(prog->array[i].declared_size); free(prog->array[i].refs); isl_union_map_free(prog->array[i].dep_order); } @@ -395,6 +402,17 @@ return array->n_index == 0; } +/* Can "array" be mapped to private memory? + * That is, is it only accessed as individual elements with + * constant index expressions? + */ +isl_bool gpu_array_can_be_private(struct gpu_array_info *array) +{ + if (!array) + return isl_bool_error; + return array->only_fixed_element; +} + /* Is "array" a read-only scalar? */ int gpu_array_is_read_only_scalar(struct gpu_array_info *array) @@ -438,7 +456,7 @@ isl_pw_aff *bound; isl_set *guard_i, *zero; - bound = isl_pw_aff_copy(array->bound[i]); + bound = isl_multi_pw_aff_get_pw_aff(array->bound, i); guard_i = isl_pw_aff_nonneg_set(isl_pw_aff_copy(bound)); zero = isl_pw_aff_zero_set(bound); guard_i = isl_set_subtract(guard_i, zero); @@ -804,7 +822,7 @@ aff = isl_aff_var_on_domain(isl_local_space_copy(ls), isl_dim_set, i); index = isl_pw_aff_from_aff(aff); - bound = isl_pw_aff_copy(array->bound[i]); + bound = isl_multi_pw_aff_get_pw_aff(array->bound, i); bound = isl_pw_aff_from_range(bound); bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index); bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in, @@ -818,8 +836,8 @@ return extent; } -/* Return a map from the first group->depth dimensions of the computed - * schedule to the array tile in +/* Return a map from the first group->shared_tile->depth dimensions + * of the computed schedule to the array tile in * global memory that corresponds to the shared memory copy. * * In particular, return a map @@ -873,15 +891,15 @@ /* Given a mapping "iterator_map" from the AST schedule to a domain, * return the corresponding mapping from the AST schedule to - * to the outer kernel->shared_schedule_dim dimensions of + * to the outer kernel->copy_schedule_dim dimensions of * the schedule computed by PPCG for this kernel. * - * Note that kernel->shared_schedule_dim is at least as large as + * Note that kernel->copy_schedule_dim is at least as large as * the largest depth of any array reference group associated to the kernel. 
* This is needed as the returned schedule is used to extract a mapping - * to the outer group->depth dimensions in transform_index. + * to the outer tile->depth dimensions in transform_index. */ -static __isl_give isl_pw_multi_aff *compute_sched_to_shared( +static __isl_give isl_pw_multi_aff *compute_sched_to_copy( struct ppcg_kernel *kernel, __isl_take isl_pw_multi_aff *iterator_map) { isl_union_pw_multi_aff *upma; @@ -891,9 +909,9 @@ space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map)); space = isl_space_from_domain(space); space = isl_space_add_dims(space, isl_dim_out, - kernel->shared_schedule_dim); + kernel->copy_schedule_dim); - upma = isl_union_pw_multi_aff_copy(kernel->shared_schedule); + upma = isl_union_pw_multi_aff_copy(kernel->copy_schedule); pma = isl_union_pw_multi_aff_extract_pw_multi_aff(upma, space); isl_union_pw_multi_aff_free(upma); @@ -929,11 +947,11 @@ for (j = 0; j < local->n_group; ++j) { struct gpu_array_ref_group *group; + enum ppcg_group_access_type type; group = local->groups[j]; - if (group->private_tile) - continue; - if (!group->shared_tile) + type = gpu_array_ref_group_type(group); + if (type != ppcg_access_shared) continue; size = gpu_array_tile_size(group->shared_tile); @@ -991,41 +1009,6 @@ } } -/* Compute the size of a bounding box around the origin and "set", - * where "set" is assumed to contain only non-negative elements. - * In particular, compute the maximal value of "set" in each direction - * and add one. - */ -static __isl_give isl_multi_pw_aff *extract_size(__isl_take isl_set *set, - __isl_take isl_set *context) -{ - int i, n; - isl_multi_pw_aff *mpa; - - context = isl_set_params(context); - n = isl_set_dim(set, isl_dim_set); - mpa = isl_multi_pw_aff_zero(isl_set_get_space(set)); - for (i = 0; i < n; ++i) { - isl_space *space; - isl_aff *one; - isl_pw_aff *bound; - - bound = isl_set_dim_max(isl_set_copy(set), i); - bound = isl_pw_aff_coalesce(bound); - bound = isl_pw_aff_gist(bound, isl_set_copy(context)); - - space = isl_pw_aff_get_domain_space(bound); - one = isl_aff_zero_on_domain(isl_local_space_from_space(space)); - one = isl_aff_add_constant_si(one, 1); - bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one)); - mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound); - } - isl_set_free(set); - isl_set_free(context); - - return mpa; -} - /* Compute the effective grid size as a list of the sizes in each dimension. * * The grid size specified by the user or set by default @@ -1050,6 +1033,8 @@ { int i; isl_set *grid; + isl_set *context; + isl_multi_pw_aff *size; domain = isl_union_set_intersect(domain, isl_union_set_copy(kernel->block_filter)); @@ -1068,7 +1053,10 @@ grid = isl_set_project_out(grid, isl_dim_param, pos, 1); } - return extract_size(grid, isl_set_copy(kernel->context)); + grid = isl_set_coalesce(grid); + size = ppcg_size_from_extent(grid); + context = isl_set_params(isl_set_copy(kernel->context)); + return isl_multi_pw_aff_gist(size, context); } /* Compute the size of a fixed bounding box around the origin and "set", @@ -1110,7 +1098,7 @@ * to the smallest block size that ensures that all threads * that actually execute code are included in the block. * - * The possible values of the thread ids is obtained from + * The set of possible values of the thread ids is obtained from * the domain elements "domain" and kernel->thread_filter. * The current implementation eliminates all parameters, ensuring * that the size is a fixed constant in each dimension. 
@@ -1118,7 +1106,7 @@ * We would have to make sure to project out all b%d and t%d parameters, * however. */ -static void extract_block_size(struct ppcg_kernel *kernel, +static isl_stat extract_block_size(struct ppcg_kernel *kernel, __isl_take isl_union_set *domain) { int i; @@ -1134,17 +1122,28 @@ int pos; isl_id *id; + if (!block) + return isl_stat_error; + id = isl_id_list_get_id(kernel->thread_ids, i); pos = isl_set_find_dim_by_id(block, isl_dim_param, id); isl_id_free(id); - assert(pos >= 0); + if (pos < 0) + isl_die(isl_set_get_ctx(block), isl_error_internal, + "missing constraints on thread identifier", + block = isl_set_free(block)); block = isl_set_equate(block, isl_dim_param, pos, isl_dim_set, i); } nparam = isl_set_dim(block, isl_dim_param); block = isl_set_project_out(block, isl_dim_param, 0, nparam); + if (!block) + return isl_stat_error; + extract_fixed_size(block, kernel->block_dim); + + return isl_stat_ok; } struct ppcg_kernel *ppcg_kernel_free(struct ppcg_kernel *kernel) @@ -1157,14 +1156,17 @@ isl_id_list_free(kernel->block_ids); isl_id_list_free(kernel->thread_ids); isl_multi_pw_aff_free(kernel->grid_size); + isl_ast_expr_free(kernel->grid_size_expr); isl_set_free(kernel->context); isl_union_set_free(kernel->core); isl_union_set_free(kernel->arrays); + isl_union_pw_multi_aff_free(kernel->contraction); + isl_union_set_free(kernel->expanded_domain); isl_space_free(kernel->space); isl_ast_node_free(kernel->tree); isl_union_set_free(kernel->block_filter); isl_union_set_free(kernel->thread_filter); - isl_union_pw_multi_aff_free(kernel->shared_schedule); + isl_union_pw_multi_aff_free(kernel->copy_schedule); isl_union_set_free(kernel->sync_writes); for (i = 0; i < kernel->n_array; ++i) { @@ -1174,7 +1176,8 @@ gpu_array_ref_group_free(array->groups[j]); free(array->groups); - isl_pw_aff_list_free(array->bound); + isl_multi_pw_aff_free(array->bound); + isl_ast_expr_free(array->bound_expr); } free(kernel->array); @@ -1204,16 +1207,11 @@ int j; struct gpu_array_tile *tile; isl_printer *p; - char *name; var->array = group->array; - tile = group->private_tile; - var->type = ppcg_access_private; - if (!tile) { - tile = group->shared_tile; - var->type = ppcg_access_shared; - } + var->type = gpu_array_ref_group_type(group); + tile = gpu_array_ref_group_tile(group); p = isl_printer_to_str(ctx); p = gpu_array_ref_group_print_name(group, p); @@ -1237,7 +1235,10 @@ for (j = 0; j < array->n_group; ++j) { struct gpu_array_ref_group *group = array->groups[j]; - if (group->private_tile || group->shared_tile) + enum ppcg_group_access_type type; + + type = gpu_array_ref_group_type(group); + if (type != ppcg_access_global) ++n; } } @@ -1253,7 +1254,10 @@ for (j = 0; j < array->n_group; ++j) { struct gpu_array_ref_group *group = array->groups[j]; - if (!group->private_tile && !group->shared_tile) + enum ppcg_group_access_type type; + + type = gpu_array_ref_group_type(group); + if (type == ppcg_access_global) continue; create_kernel_var(kernel->ctx, group, &kernel->var[n]); ++n; @@ -1304,27 +1308,27 @@ for (i = 0; i < kernel->n_array; ++i) { struct gpu_local_array_info *local = &kernel->array[i]; - isl_pw_aff_list *bound; + isl_multi_pw_aff *bound; int n_index; if (local->n_group == 0) continue; n_index = local->array->n_index; - bound = isl_pw_aff_list_alloc(kernel->ctx, n_index); + bound = isl_multi_pw_aff_copy(local->array->bound); for (j = 0; j < n_index; ++j) { isl_pw_aff *pwaff; int empty; - pwaff = isl_pw_aff_copy(local->array->bound[j]); + pwaff = isl_multi_pw_aff_get_pw_aff(bound, j); pwaff 
= isl_pw_aff_gist(pwaff, isl_set_copy(context)); empty = isl_pw_aff_is_empty(pwaff); if (empty < 0) pwaff = isl_pw_aff_free(pwaff); else if (empty) pwaff = set_universally_zero(pwaff); - bound = isl_pw_aff_list_add(bound, pwaff); + bound = isl_multi_pw_aff_set_pw_aff(bound, j, pwaff); } local->n_index = n_index; @@ -1384,7 +1388,6 @@ void ppcg_kernel_stmt_free(void *user) { - int i; struct ppcg_kernel_stmt *stmt = user; if (!stmt) @@ -1441,7 +1444,7 @@ * "accesses" is the list of gpu_stmt_access in the statement. * "iterator_map" expresses the statement iterators in terms of * the AST loop iterators. - * "sched2shared" expresses the outer shared_schedule_dim dimensions of + * "sched2copy" expresses the outer copy_schedule_dim dimensions of * the kernel schedule in terms of the AST loop iterators and * may be NULL if we are not inside a kernel. * @@ -1453,11 +1456,10 @@ * to the current kernel. */ struct ppcg_transform_data { - struct ppcg_options *options; struct ppcg_kernel *kernel; struct gpu_stmt_access *accesses; isl_pw_multi_aff *iterator_map; - isl_pw_multi_aff *sched2shared; + isl_pw_multi_aff *sched2copy; struct gpu_array_info *array; int global; @@ -1484,6 +1486,66 @@ return NULL; } +/* Given an index expression "index" of the form + * + * L -> F(A), + * + * with F(A) either A or some subfield of A and L the AST loop iterators, + * and a tiling "tiling" of the form + * + * [L -> A] -> T + * + * apply the tiling to the outer array in the index expression to obtain + * + * L -> T(A) + * + * If F(A) is some subfield of A, then separate the member access + * into the base index expression and the field index expression, + * apply the tiling to the base index expression and combine the result + * with the field index expression. + * + * If F(A) is A, then modify index to keep track of the iterators + * + * L -> [L -> A] + * + * and combine the result with the tiling to obtain a tiled index expression + * in terms of the AST loop iterators + * + * L -> T + */ +static __isl_give isl_multi_pw_aff *tile_outer( + __isl_take isl_multi_pw_aff *index, __isl_take isl_multi_pw_aff *tiling) +{ + isl_bool is_wrapping; + isl_space *space; + isl_multi_pw_aff *mpa; + + is_wrapping = isl_multi_pw_aff_range_is_wrapping(index); + if (is_wrapping < 0) + goto error; + if (is_wrapping) { + isl_multi_pw_aff *field; + + field = isl_multi_pw_aff_copy(index); + field = isl_multi_pw_aff_range_factor_range(field); + index = isl_multi_pw_aff_range_factor_domain(index); + index = tile_outer(index, tiling); + return isl_multi_pw_aff_range_product(index, field); + } + + space = isl_space_domain(isl_multi_pw_aff_get_space(index)); + space = isl_space_map_from_set(space); + mpa = isl_multi_pw_aff_identity(space); + index = isl_multi_pw_aff_range_product(mpa, index); + index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index); + + return index; +error: + isl_multi_pw_aff_free(index); + isl_multi_pw_aff_free(tiling); + return NULL; +} + /* Index transformation callback for pet_stmt_build_ast_exprs. * * "index" expresses the array indices in terms of statement iterators @@ -1504,7 +1566,7 @@ * * [D -> A] -> T * - * where D corresponds to the outer group->depth dimensions of + * where D corresponds to the outer tile->depth dimensions of * the kernel schedule. 
 * The index is of the form
 *
@@ -1514,14 +1576,16 @@
 *
 *	[L -> A] -> T
 *
- * and modify index to keep track of those iterators
- *
- *	L -> [L -> A]
- *
- * Combining these two yields a tiled index expression in terms
+ * and combine it with the index to obtain a tiled index expression in terms
 * of the AST loop iterators
 *
 *	L -> T
+ *
+ * Note that while the tiling applies directly to an outer array,
+ * the index may refer to some subfield of this outer array.
+ * In such cases, the result will refer to the same subfield of the tile.
+ * That is, an index expression of the form L -> F(A) will be transformed
+ * into an index expression of the form L -> F(T).
 */
static __isl_give isl_multi_pw_aff *transform_index(
	__isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
@@ -1538,7 +1602,6 @@
	isl_space *space;
	isl_multi_pw_aff *tiling;
	isl_pw_multi_aff *pma;
-	isl_multi_pw_aff *mpa;
	isl_pw_multi_aff *sched2depth;
	data->array = NULL;
@@ -1570,30 +1633,25 @@
		return index;
	}
-	tile = group->private_tile;
-	if (!tile)
-		tile = group->shared_tile;
+	tile = gpu_array_ref_group_tile(group);
	data->global = !tile;
	if (!tile)
		return index;
-	space = isl_space_range(isl_multi_pw_aff_get_space(index));
+	space = isl_space_domain(isl_multi_aff_get_space(tile->tiling));
+	space = isl_space_range(isl_space_unwrap(space));
	space = isl_space_map_from_set(space);
	pma = isl_pw_multi_aff_identity(space);
-	sched2depth = isl_pw_multi_aff_copy(data->sched2shared);
+	sched2depth = isl_pw_multi_aff_copy(data->sched2copy);
	dim = isl_pw_multi_aff_dim(sched2depth, isl_dim_out);
	sched2depth = isl_pw_multi_aff_drop_dims(sched2depth, isl_dim_out,
-		group->depth, dim - group->depth);
+		tile->depth, dim - tile->depth);
	pma = isl_pw_multi_aff_product(sched2depth, pma);
	tiling = isl_multi_pw_aff_from_multi_aff(
		isl_multi_aff_copy(tile->tiling));
	tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);
-	space = isl_space_domain(isl_multi_pw_aff_get_space(index));
-	space = isl_space_map_from_set(space);
-	mpa = isl_multi_pw_aff_identity(space);
-	index = isl_multi_pw_aff_range_product(mpa, index);
-	index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
+	index = tile_outer(index, tiling);
	return index;
}
@@ -1666,21 +1724,18 @@
 * element while the default linearized expression would refer to
 * a single element, we return the expression
 *
- *	A + (..((i_0 * b_1 + i_1) ... ) * b_n]
+ *	A + (..((i_0 * b_1 + i_1) ... ) * b_l + i_l)
 *
 * instead. Note that because of the special case handling above,
- * we can assume here that here that there is at least one index expression.
+ * we can assume here that there is at least one index expression.
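+ * As an illustrative instance of the scheme above: for an array
+ * declared as A[n][m], an access to element (i_0, i_1) is linearized
+ * to the expression
+ *
+ *	A + (i_0 * m + i_1)
+ *
+ * with b_1 = m.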
*/ __isl_give isl_ast_expr *gpu_local_array_info_linearize_index( struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr) { int i, n; - isl_ctx *ctx; - isl_set *context; isl_ast_expr *arg0; isl_ast_expr *res; isl_ast_expr_list *list; - isl_ast_build *build; arg0 = isl_ast_expr_get_op_arg(expr, 0); if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op && @@ -1699,18 +1754,12 @@ if (isl_ast_expr_get_op_n_arg(expr) == 1) return expr; - ctx = isl_ast_expr_get_ctx(expr); - context = isl_set_universe(isl_space_params_alloc(ctx, 0)); - build = isl_ast_build_from_context(context); - n = isl_ast_expr_get_op_n_arg(expr); res = isl_ast_expr_get_op_arg(expr, 1); for (i = 1; i < array->n_index; ++i) { - isl_pw_aff *bound_i; isl_ast_expr *expr_i; - bound_i = isl_pw_aff_list_get_pw_aff(array->bound, i); - expr_i = isl_ast_build_expr_from_pw_aff(build, bound_i); + expr_i = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i); res = isl_ast_expr_mul(res, expr_i); if (i + 1 >= n) @@ -1719,8 +1768,6 @@ res = isl_ast_expr_add(res, expr_i); } - isl_ast_build_free(build); - if (1 + array->n_index > n) { res = isl_ast_expr_add(isl_ast_expr_get_op_arg(expr, 0), res); } else { @@ -1782,20 +1829,19 @@ * with name "user". * These AST expressions are computed from iterator_map, * which expresses the domain - * elements in terms of the generated loops, and sched2shared, - * which expresses the outer shared_schedule_dim dimensions of + * elements in terms of the generated loops, and sched2copy, + * which expresses the outer copy_schedule_dim dimensions of * the kernel schedule computed by PPCG in terms of the generated loops. */ static __isl_give isl_ast_node *create_domain_leaf( struct ppcg_kernel *kernel, __isl_take isl_ast_node *node, - __isl_keep isl_ast_build *build, struct gpu_stmt *gpu_stmt, - struct gpu_gen *gen) + __isl_keep isl_ast_build *build, struct gpu_stmt *gpu_stmt) { struct ppcg_transform_data data; struct ppcg_kernel_stmt *stmt; isl_ctx *ctx; isl_id *id; - isl_pw_multi_aff *sched2shared; + isl_pw_multi_aff *sched2copy; isl_map *map; isl_pw_multi_aff *iterator_map; isl_union_map *schedule; @@ -1812,10 +1858,10 @@ map = isl_map_reverse(isl_map_from_union_map(schedule)); iterator_map = isl_pw_multi_aff_from_map(map); if (kernel) - sched2shared = compute_sched_to_shared(kernel, + sched2copy = compute_sched_to_copy(kernel, isl_pw_multi_aff_copy(iterator_map)); else - sched2shared = NULL; + sched2copy = NULL; stmt->type = ppcg_kernel_domain; stmt->u.d.stmt = gpu_stmt; @@ -1823,12 +1869,13 @@ data.kernel = kernel; data.accesses = stmt->u.d.stmt->accesses; data.iterator_map = iterator_map; - data.sched2shared = sched2shared; - stmt->u.d.ref2expr = gen->build_ast_expr(stmt->u.d.stmt->stmt, + data.sched2copy = sched2copy; + stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(stmt->u.d.stmt->stmt, build, &transform_index, &data, &transform_expr, &data); + isl_pw_multi_aff_free(iterator_map); - isl_pw_multi_aff_free(sched2shared); + isl_pw_multi_aff_free(sched2copy); id = isl_id_alloc(ctx, "user", stmt); id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); @@ -1846,7 +1893,7 @@ * * type[D -> A] -> L * - * where D corresponds to the outer group->depth dimensions of + * where D corresponds to the outer tile->depth dimensions of * the kernel schedule, A to the global array and L to the outer * generated AST schedule. 
* We compute the inverse and strip off the type, resulting in @@ -1867,6 +1914,7 @@ * * and store the corresponding expressions in stmt->index and stmt->local_index, * where stmt points to the ppcg_kernel_stmt that is attached to the node. + * stmt->index is linearized if the global memory array is linearized. */ static __isl_give isl_ast_node *create_access_leaf(struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, __isl_take isl_ast_node *node, @@ -1898,6 +1946,9 @@ pma2 = isl_pw_multi_aff_pullback_pw_multi_aff(pma2, isl_pw_multi_aff_copy(pma)); expr = isl_ast_build_access_from_pw_multi_aff(build, pma2); + if (group->array->linearize) + expr = gpu_local_array_info_linearize_index(group->local_array, + expr); stmt->u.c.index = expr; tile = gpu_array_ref_group_tile(group); @@ -1911,7 +1962,7 @@ stmt->u.c.local_array = group->local_array; stmt->type = ppcg_kernel_copy; - id = isl_id_alloc(kernel->ctx, NULL, stmt); + id = isl_id_alloc(kernel->ctx, "copy", stmt); id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); return isl_ast_node_set_annotation(node, id); } @@ -1931,11 +1982,57 @@ return isl_ast_node_free(node); stmt->type = ppcg_kernel_sync; - id = isl_id_alloc(kernel->ctx, NULL, stmt); + id = isl_id_alloc(kernel->ctx, "sync", stmt); id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free); return isl_ast_node_set_annotation(node, id); } +/* Build AST expressions for the device array sizes of all arrays in "prog" + * that require allocation on the device using "build", as well as + * for the original array sizes of all arrays that need to be declared + * on the host. + * "node" is freed in case of error. + */ +static __isl_give isl_ast_node *build_array_bounds( + __isl_take isl_ast_node *node, struct gpu_prog *prog, + __isl_keep isl_ast_build *build) +{ + int i; + + for (i = 0; i < prog->n_array; ++i) { + struct gpu_array_info *array = &prog->array[i]; + isl_multi_pw_aff *size; + isl_ast_expr *expr; + + if (!gpu_array_requires_device_allocation(array)) + continue; + + size = isl_multi_pw_aff_copy(array->bound); + expr = ppcg_build_size_expr(size, build); + array->bound_expr = expr; + if (!expr) + return isl_ast_node_free(node); + } + + for (i = 0; i < prog->n_array; ++i) { + struct gpu_array_info *array = &prog->array[i]; + isl_set *extent; + isl_multi_pw_aff *size; + isl_ast_expr *expr; + + if (!array->declare_local) + continue; + extent = isl_set_copy(array->declared_extent); + size = ppcg_size_from_extent(extent); + expr = ppcg_build_size_expr(size, build); + array->declared_size = expr; + if (!expr) + return isl_ast_node_free(node); + } + + return node; +} + /* Internal data structure for at_domain. * * "prog" represents the entire scop. @@ -1945,7 +2042,6 @@ */ struct ppcg_at_domain_data { struct gpu_prog *prog; - struct gpu_gen *gen; struct ppcg_kernel *kernel; }; @@ -1959,9 +2055,11 @@ * requires special handling. * * If the user statement is one of the original user statements, then we call - * create_domain_leaf. Otherwise, we check if it is a copy or synchronization + * create_domain_leaf. If it is "init_device", then we call + * build_array_bounds. Otherwise, we check if it is a copy or synchronization * statement and call the appropriate functions. Statements that copy an array * to/from the device do not need any further treatment. + * Neither does "clear_device". 
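+ * The copy statements are recognized purely by their names: for an
+ * array A, they would be called, e.g., "to_device_A" and
+ * "from_device_A".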
*/ static __isl_give isl_ast_node *at_domain(__isl_take isl_ast_node *node, __isl_keep isl_ast_build *build, void *user) @@ -1987,11 +2085,14 @@ isl_id_free(id); if (gpu_stmt) - return create_domain_leaf(data->kernel, node, build, gpu_stmt, - data->gen); + return create_domain_leaf(data->kernel, node, build, gpu_stmt); if (!prefixcmp(name, "to_device_") || !prefixcmp(name, "from_device_")) return node; + if (!strcmp(name, "init_device")) + return build_array_bounds(node, data->prog, build); + if (!strcmp(name, "clear_device")) + return node; if (is_sync < 0) return isl_ast_node_free(node); if (!strcmp(name, "read") || !strcmp(name, "write")) { @@ -2049,6 +2150,8 @@ * remove those reads if ("read" is 1) or writes (if "read" is 0) * that are only needed to communicate data within * the same iteration of "sched". + * The domain of "sched" corresponds to the original statement instances, + * i.e., those that appear in the domains of the access relations. * "tagged" contains all tagged access relations to all * the array reference groups accessed by "access" from statement * instances scheduled by "sched". @@ -2189,17 +2292,19 @@ /* Given an access relation "access" from "group", remove those reads * if ("read" is 1) or writes (if "read" is 0) that are only needed to - * communicate data within the same iteration of the schedule at the - * position where the copying of the group is inserted. - * "node" points to this position, i.e., the depth at "node" - * is equal to group->depth. + * communicate data within the same iteration of the schedule "prefix" + * at the position where the copying of the group is inserted. + * That is, the output dimension of "prefix" + * is equal to tile->depth. + * The domain of "prefix" corresponds to the original statement instances, + * i.e., those that appear in the domains of the access relations. * - * We extract a schedule that picks out the iterations of the outer - * group->depth dimensions and call remove_local_accesses. + * Extract the tagged access relation of "group" and + * then call remove_local_accesses. */ static __isl_give isl_union_map *remove_local_accesses_group( struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, - __isl_take isl_union_map *access, __isl_keep isl_schedule_node *node, + __isl_take isl_union_map *access, __isl_keep isl_union_map *prefix, int read) { isl_union_map *sched, *tagged; @@ -2208,27 +2313,85 @@ return access; tagged = group_tagged_access_relation(group); - sched = isl_schedule_node_get_prefix_schedule_relation(node); + sched = isl_union_map_copy(prefix); return remove_local_accesses(kernel->prog, tagged, access, sched, read); } +/* Build an access AST expression for the effective grid size using "build". + * Store the result in kernel->grid_size_expr. + */ +static isl_stat build_grid_size(struct ppcg_kernel *kernel, + __isl_keep isl_ast_build *build) +{ + isl_multi_pw_aff *size; + + size = isl_multi_pw_aff_copy(kernel->grid_size); + size = isl_multi_pw_aff_set_tuple_name(size, isl_dim_out, "grid"); + kernel->grid_size_expr = ppcg_build_size_expr(size, build); + + if (!kernel->grid_size_expr) + return isl_stat_error; + return isl_stat_ok; +} + +/* Build access AST expressions for the localized array sizes using "build". + * Store the result in local->bound_expr. + * Only do this for arrays for which localized bounds have been computed. 
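+ * The result is an access AST expression whose argument 1 + i gives
+ * the size in dimension i, matching the layout that
+ * gpu_local_array_info_linearize_index assumes when it extracts
+ * isl_ast_expr_get_op_arg(array->bound_expr, 1 + i).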
+ */
+static isl_stat build_local_array_sizes(struct ppcg_kernel *kernel,
+	__isl_keep isl_ast_build *build)
+{
+	int i;
+
+	for (i = 0; i < kernel->n_array; ++i) {
+		struct gpu_local_array_info *local = &kernel->array[i];
+		isl_multi_pw_aff *size;
+
+		if (local->n_group == 0)
+			continue;
+		size = isl_multi_pw_aff_copy(local->bound);
+		local->bound_expr = ppcg_build_size_expr(size, build);
+		if (!local->bound_expr)
+			return isl_stat_error;
+	}
+
+	return isl_stat_ok;
+}
+
+/* Build access AST expressions for the effective grid size and
+ * the localized array sizes using "build".
+ */
+static isl_stat build_grid_and_local_array_sizes(struct ppcg_kernel *kernel,
+	__isl_keep isl_ast_build *build)
+{
+	if (build_grid_size(kernel, build) < 0)
+		return isl_stat_error;
+	if (build_local_array_sizes(kernel, build) < 0)
+		return isl_stat_error;
+	return isl_stat_ok;
+}
+
 /* This function is called before the AST generator starts traversing
 * the schedule subtree of a node with mark "mark".
 *
 * If the mark is called "kernel", store the kernel pointer in data->kernel
- * for use in at_domain.
+ * for use in at_domain and build AST expressions for the grid size and
+ * the localized array sizes.
 */
-static int before_mark(__isl_keep isl_id *mark,
+static isl_stat before_mark(__isl_keep isl_id *mark,
	__isl_keep isl_ast_build *build, void *user)
{
	struct ppcg_at_domain_data *data = user;
	if (!mark)
-		return -1;
-	if (!strcmp(isl_id_get_name(mark), "kernel"))
+		return isl_stat_error;
+	if (!strcmp(isl_id_get_name(mark), "kernel")) {
		data->kernel = isl_id_get_user(mark);
-	return 0;
+		if (build_grid_and_local_array_sizes(data->kernel, build) < 0)
+			return isl_stat_error;
+	}
+	return isl_stat_ok;
}
/* This function is called after the AST generator has finished traversing
@@ -2297,7 +2460,7 @@
 * The ASTs for the device code are embedded in ppcg_kernel objects
 * attached to the leaf nodes that call "kernel".
 */
-__isl_give isl_ast_node *generate_code(struct gpu_gen *gen,
+static __isl_give isl_ast_node *generate_code(struct gpu_gen *gen,
	__isl_take isl_schedule *schedule)
{
	struct ppcg_at_domain_data data;
@@ -2307,7 +2470,6 @@
	int depth;
	data.prog = gen->prog;
-	data.gen = gen;
	data.kernel = NULL;
	depth = 0;
@@ -2376,42 +2538,73 @@
	return isl_bool_error;
}
+/* Does the subtree rooted at "node" have any suitably permutable band nodes?
+ * That is, does it have any nodes that are permutable and that
+ * have at least one coincident dimension?
+ */
+static int subtree_has_permutable_bands(__isl_keep isl_schedule_node *node)
+{
+	int any_parallelism = 0;
+
+	if (isl_schedule_node_foreach_descendant_top_down(node, &set_permutable,
+		&any_parallelism) < 0 &&
+	    !any_parallelism)
+		return -1;
+
+	return any_parallelism;
+}
+
 /* Does "schedule" contain any permutable band with at least one coincident
 * member?
 */
-int has_any_permutable_node(__isl_keep isl_schedule *schedule)
+static int has_any_permutable_node(__isl_keep isl_schedule *schedule)
{
-	int any_permutable = 0;
+	isl_schedule_node *root;
+	int any_permutable;
-	if (isl_schedule_foreach_schedule_node_top_down(schedule,
-		&set_permutable, &any_permutable) < 0 &&
-	    !any_permutable)
-		return -1;
+	root = isl_schedule_get_root(schedule);
+	any_permutable = subtree_has_permutable_bands(root);
+	isl_schedule_node_free(root);
	return any_permutable;
}
-/* Is "node" a leaf or can it be tiled and then mapped to
- * block and thread identifiers?
+/* Is "node" a candidate for mapping to block and thread identifiers?
+ * In particular, is it permutable with at least one coincident dimension?
+ * Alternatively, does the subtree rooted at "node" not contain
+ * any such permutable node? Filter nodes are skipped in this case,
+ * because a band node will be inserted in front of the returned
+ * node and this is not possible for filter nodes that are children
+ * of set or sequence nodes.
 */
-static int is_leaf_or_tilable(__isl_keep isl_schedule_node *node)
+static int is_candidate(__isl_keep isl_schedule_node *node)
{
+	int permutable;
+
	if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf)
		return 1;
-	return is_permutable(node);
+	permutable = is_permutable(node);
+	if (permutable < 0 || permutable)
+		return permutable;
+	if (isl_schedule_node_get_type(node) == isl_schedule_node_filter)
+		return 0;
+	permutable = subtree_has_permutable_bands(node);
+	if (permutable < 0)
+		return -1;
+	return !permutable;
}
/* Is "node" the outermost node in its branch that can be tiled
 * and then mapped to block and thread identifiers?
- * If there are no such nodes in the branch and if "node" is a leaf,
- * then it is accepted too.
+ * If there are no such nodes in the subtree at "node" and
+ * if "node" is not a filter node, then it is accepted too.
 */
static int is_outer_tilable(__isl_keep isl_schedule_node *node)
{
	int tilable;
	isl_schedule_node *ancestor;
-	tilable = is_leaf_or_tilable(node);
+	tilable = is_candidate(node);
	if (tilable < 0)
		return -1;
	if (!tilable)
@@ -2422,7 +2615,7 @@
	while (isl_schedule_node_has_parent(ancestor)) {
		ancestor = isl_schedule_node_parent(ancestor);
-		tilable = is_permutable(ancestor);
+		tilable = is_candidate(ancestor);
		if (tilable < 0 || tilable)
			break;
	}
@@ -2510,11 +2703,13 @@
	for (j = 0; j < array->n_group; ++j) {
		struct gpu_array_ref_group *group = array->groups[j];
+		enum ppcg_group_access_type type;
		isl_union_set *writes_ij;
		if (!group->write)
			continue;
-		if (group->private_tile)
+		type = gpu_array_ref_group_type(group);
+		if (type == ppcg_access_private)
			continue;
		writes_ij = group_tagged_writes(group);
		writes = isl_union_set_union(writes, writes_ij);
@@ -2551,26 +2746,13 @@
static __isl_give isl_multi_val *construct_band_tiles_sizes(
	__isl_keep isl_schedule_node *node, int *tile_size)
{
-	int i, n;
-	isl_ctx *ctx;
	isl_space *space;
-	isl_multi_val *mv;
	if (!node)
		return NULL;
-	ctx = isl_schedule_node_get_ctx(node);
	space = isl_schedule_node_band_get_space(node);
-	n = isl_schedule_node_band_n_member(node);
-	mv = isl_multi_val_zero(space);
-	for (i = 0; i < n; ++i) {
-		isl_val *v;
-
-		v = isl_val_int_from_si(ctx, tile_size[i]);
-		mv = isl_multi_val_set_val(mv, i, v);
-	}
-
-	return mv;
+	return ppcg_multi_val_from_int_list(space, tile_size);
}
/* Replace the partial schedule S of the band node "node" by
@@ -2671,7 +2853,9 @@
}
/* Return the set of outer array elements accessed by
- * by the statement instance in "domain" in "prog".
+ * the statement instances in "domain" in "prog".
+ * The instances in "domain" are those that appear
+ * in the domains of the access relations in "prog".
*/ static __isl_give isl_union_set *accessed_by_domain( __isl_take isl_union_set *domain, struct gpu_prog *prog) @@ -2940,12 +3124,7 @@ */ static __isl_give isl_schedule_node *unroll(__isl_take isl_schedule_node *node) { - int i, n; - - n = isl_schedule_node_band_n_member(node); - for (i = 0; i < n; ++i) - node = isl_schedule_node_band_member_set_ast_loop_type(node, i, - isl_ast_loop_unroll); + node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); node = isl_schedule_node_band_sink(node); @@ -2966,11 +3145,14 @@ * may have a different mapping from between shared memory elements and * threads, such that synchronization is required after the core. * "node" is assumed to point to the kernel node. + * + * If the shared and the thread mark point to the same node, then make + * sure the synchronization is inserted outside of the shared mark. */ static __isl_give isl_schedule_node *add_sync(struct ppcg_kernel *kernel, __isl_take isl_schedule_node *node) { - int kernel_depth; + int depth; int need_sync; need_sync = any_global_or_shared_sync_writes(kernel); @@ -2979,12 +3161,13 @@ if (!need_sync) return node; - kernel_depth = isl_schedule_node_get_schedule_depth(node); - node = gpu_tree_move_down_to_thread(node, kernel->core); - if (kernel_depth == isl_schedule_node_get_schedule_depth(node)) - return gpu_tree_move_up_to_kernel(node); + depth = isl_schedule_node_get_schedule_depth(node); + node = gpu_tree_move_up_to_kernel(node); + if (depth == isl_schedule_node_get_schedule_depth(node)) + return node; + node = gpu_tree_move_down_to_depth(node, depth, kernel->core); node = gpu_tree_ensure_following_sync(node, kernel); node = gpu_tree_move_up_to_kernel(node); @@ -3009,9 +3192,12 @@ isl_union_map *access; isl_union_map *prefix; - access = gpu_array_ref_group_access_relation(group, read, !read); - access = remove_local_accesses_group(kernel, group, access, node, read); prefix = isl_schedule_node_get_prefix_schedule_relation(node); + prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix, + isl_union_pw_multi_aff_copy(kernel->contraction)); + access = gpu_array_ref_group_access_relation(group, read, !read); + access = remove_local_accesses_group(kernel, group, access, prefix, + read); access = isl_union_map_range_product(prefix, access); return access; @@ -3026,18 +3212,20 @@ * write[D -> A] -> [D -> A] * * if "read" is not set. - * D corresponds to the outer group->depth dimensions of + * D corresponds to the outer tile->depth dimensions of * the kernel schedule. 
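+ * For example, for a two-dimensional array A and tile->depth equal
+ * to 2, the result for a read has the illustrative form
+ *
+ *	read[[i0, i1] -> A[a0, a1]] -> [[i0, i1] -> A[a0, a1]]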
*/ static __isl_give isl_multi_aff *create_from_access(isl_ctx *ctx, struct gpu_array_ref_group *group, int read) { + struct gpu_array_tile *tile; isl_space *space; isl_id *id; + tile = gpu_array_ref_group_tile(group); space = isl_space_copy(group->array->space); space = isl_space_from_range(space); - space = isl_space_add_dims(space, isl_dim_in, group->depth); + space = isl_space_add_dims(space, isl_dim_in, tile->depth); space = isl_space_wrap(space); space = isl_space_map_from_set(space); @@ -3074,9 +3262,12 @@ node = isl_schedule_node_child(node, 0); node = gpu_tree_ensure_following_sync(node, kernel); } else if (shared) { + struct gpu_array_tile *tile; + + tile = gpu_array_ref_group_tile(group); node = isl_schedule_node_parent(node); node = isl_schedule_node_parent(node); - node = gpu_tree_move_down_to_depth(node, group->depth, + node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core); node = gpu_tree_move_left_to_sync(node, kernel); } @@ -3094,14 +3285,14 @@ * * The copies are performed in the order of the array elements. * The copy statement instances include a reference to the outer - * group->depth dimensions of the kernel schedule for ease of + * tile->depth dimensions of the kernel schedule for ease of * combining them with the group tiling. * * That is, the extra schedule is of the form * * type[D -> A] -> A * - * where D corresponds to the outer group->depth dimensions of + * where D corresponds to the outer tile->depth dimensions of * the kernel schedule and A to the global array. * This schedule is unrolled because registers are not addressable. * @@ -3133,20 +3324,22 @@ struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, __isl_take isl_schedule_node *node, int read) { + struct gpu_array_tile *tile; isl_union_map *access; - isl_union_map *prefix; isl_union_set *domain; isl_space *space; isl_multi_aff *from_access; isl_multi_pw_aff *mpa; isl_multi_union_pw_aff *mupa; + isl_union_pw_multi_aff *contraction; isl_schedule_node *graft; isl_union_set *filter; int kernel_depth; int empty; kernel_depth = isl_schedule_node_get_schedule_depth(node); - node = gpu_tree_move_down_to_depth(node, group->depth, kernel->core); + tile = gpu_array_ref_group_tile(group); + node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core); access = anchored_non_local_accesses(kernel, group, node, read); empty = isl_union_map_is_empty(access); @@ -3165,6 +3358,8 @@ access = isl_union_map_preimage_range_multi_aff(access, from_access); filter = isl_union_set_copy(kernel->thread_filter); + contraction = isl_union_pw_multi_aff_copy(kernel->contraction); + filter = isl_union_set_preimage_union_pw_multi_aff(filter, contraction); filter = isl_union_set_apply(filter, isl_union_map_copy(access)); filter = isl_union_set_detect_equalities(filter); filter = isl_union_set_coalesce(filter); @@ -3192,7 +3387,7 @@ node = isl_schedule_node_graft_before(node, graft); else { node = isl_schedule_node_graft_after(node, graft); - if (kernel_depth < group->depth) + if (kernel_depth < tile->depth) node = add_group_write_sync(node, kernel, group, 0); } @@ -3212,7 +3407,7 @@ * The copies are performed in the order of the corresponding shared * memory tile. * The copy statement instances include a reference to the outer - * group->depth dimensions of the kernel schedule for ease of + * tile->depth dimensions of the kernel schedule for ease of * combining them with the group tiling. 
* * If we are performing a read from global memory to shared memory and @@ -3228,7 +3423,7 @@ * * type[D -> A] -> T * - * where D corresponds to the outer group->depth dimensions of + * where D corresponds to the outer tile->depth dimensions of * the kernel schedule, A to the global array and T is the corresponding * shared memory tile. * @@ -3241,6 +3436,9 @@ * by the group. In the case of read from a non-scalar, this set * is replaced by the entire shared memory tile. * + * If the "unroll_copy_shared" option is set, then the AST generator + * is instructed to unroll the copying code. + * * A filter is inserted on type[D -> A] to map the copy instances * to the threads. In particular, the thread identifiers are * equated to the position inside the shared memory tile (T) @@ -3283,7 +3481,6 @@ struct gpu_array_tile *tile; isl_union_map *access; isl_union_set *domain; - isl_union_set *sync; isl_multi_aff *ma; isl_multi_aff *from_access; isl_multi_pw_aff *mpa; @@ -3294,8 +3491,9 @@ int kernel_depth; int empty; + tile = gpu_array_ref_group_tile(group); kernel_depth = isl_schedule_node_get_schedule_depth(node); - node = gpu_tree_move_down_to_depth(node, group->depth, kernel->core); + node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core); access = anchored_non_local_accesses(kernel, group, node, read); empty = isl_union_map_is_empty(access); @@ -3311,7 +3509,6 @@ from_access = create_from_access(kernel->ctx, group, read); - tile = gpu_array_ref_group_tile(group); ma = isl_multi_aff_copy(tile->tiling); ma = isl_multi_aff_pullback_multi_aff(ma, isl_multi_aff_copy(from_access)); @@ -3336,6 +3533,8 @@ graft = isl_schedule_node_child(graft, 0); graft = isl_schedule_node_insert_partial_schedule(graft, mupa); + if (kernel->options->unroll_copy_shared) + graft = ppcg_set_schedule_node_type(graft, isl_ast_loop_unroll); if (tile->n > kernel->n_block && kernel->n_block > 0) { graft = isl_schedule_node_band_split(graft, @@ -3359,14 +3558,14 @@ graft = isl_schedule_node_parent(graft); if (read) { - if (kernel_depth < group->depth) + if (kernel_depth < tile->depth) node = gpu_tree_ensure_sync_after_core(node, kernel); node = gpu_tree_move_left_to_sync(node, kernel); node = isl_schedule_node_graft_before(node, graft); } else { node = gpu_tree_move_right_to_sync(node, kernel); node = isl_schedule_node_graft_after(node, graft); - if (kernel_depth < group->depth) + if (kernel_depth < tile->depth) node = add_group_write_sync(node, kernel, group, 1); } @@ -3388,9 +3587,12 @@ struct ppcg_kernel *kernel, struct gpu_array_ref_group *group, __isl_take isl_schedule_node *node, int read) { - if (group->private_tile) + enum ppcg_group_access_type type; + + type = gpu_array_ref_group_type(group); + if (type == ppcg_access_private) return add_copies_group_private(kernel, group, node, read); - if (group->shared_tile) + if (type == ppcg_access_shared) return add_copies_group_shared(kernel, group, node, read); return node; } @@ -3429,14 +3631,7 @@ */ static __isl_give isl_schedule_node *atomic(__isl_take isl_schedule_node *node) { - int i, n; - - n = isl_schedule_node_band_n_member(node); - for (i = 0; i < n; ++i) - node = isl_schedule_node_band_member_set_ast_loop_type(node, i, - isl_ast_loop_atomic); - - return node; + return ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); } /* Mark "node" atomic, if it is a band node. 
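The unroll and atomic helpers above now delegate to ppcg_set_schedule_node_type. Judging from the two removed loops, that helper (defined in ppcg.c, outside the part of the patch shown here) is presumably equivalent to the following sketch; the sketch function name is invented for illustration.

/* Sketch of ppcg_set_schedule_node_type, reconstructed from its two
 * call sites above: mark every member of the band node "node" with
 * the AST loop type "type" (e.g., isl_ast_loop_unroll or
 * isl_ast_loop_atomic). The actual helper may differ in details.
 */
static __isl_give isl_schedule_node *set_schedule_node_type_sketch(
	__isl_take isl_schedule_node *node, enum isl_ast_loop_type type)
{
	int i, n;

	n = isl_schedule_node_band_n_member(node);
	for (i = 0; i < n; ++i)
		node = isl_schedule_node_band_member_set_ast_loop_type(node,
			i, type);

	return node;
}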
@@ -3507,14 +3702,22 @@ isl_union_map *equal; isl_union_set *wrap; isl_union_set *domain; + isl_union_pw_multi_aff *contraction; - domain = isl_schedule_node_get_universe_domain(node); kernel_prefix = isl_schedule_node_get_prefix_schedule_union_map(node); node = isl_schedule_node_copy(node); node = gpu_tree_move_down_to_thread(node, kernel->core); thread_prefix = isl_schedule_node_get_prefix_schedule_union_map(node); isl_schedule_node_free(node); + contraction = kernel->contraction; + kernel_prefix = isl_union_map_preimage_domain_union_pw_multi_aff( + kernel_prefix, isl_union_pw_multi_aff_copy(contraction)); + thread_prefix = isl_union_map_preimage_domain_union_pw_multi_aff( + thread_prefix, isl_union_pw_multi_aff_copy(contraction)); + domain = isl_union_set_copy(kernel->expanded_domain); + domain = isl_union_set_universe(domain); + may_writes = isl_union_map_copy(kernel->prog->scop->tagged_may_writes); may_writes = isl_union_map_curry(may_writes); may_writes = isl_union_map_intersect_domain(may_writes, domain); @@ -3564,7 +3767,10 @@ * The band that "node" points to is the band that needs to be mapped * to block identifiers. The band that needs to be mapped to thread * identifiers should be marked by a "thread" mark by the caller. - * This mark is removed by this function. + * The linear branch between the current node and the "thread" mark + * may also have a "shared" mark. If present, the mapping to shared + * memory is computed at that point. + * Both marks are removed by this function. * If "scale" is set, then the band that "node" points to is scaled * by "sizes". * @@ -3607,7 +3813,7 @@ * to be unrolled, then we perform the required unrolling. * * We save a copy of the schedule that may influence the mappings - * to shared or private memory in kernel->shared_schedule. + * to shared or private memory in kernel->copy_schedule. * * Finally, we add synchronization and copy statements to the schedule tree, * remove the "thread" mark and create representations for the local @@ -3617,7 +3823,7 @@ * that the kernel does not get destroyed if the schedule node * is freed due to some error condition. 
*/ -static __isl_give isl_schedule_node *create_kernel(struct gpu_gen *gen, +__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen, __isl_take isl_schedule_node *node, int scale, __isl_keep isl_multi_val *sizes) { @@ -3625,10 +3831,15 @@ isl_id *id; isl_schedule_node *node_thread; isl_union_map *host_schedule; + isl_union_pw_multi_aff *contraction; isl_set *host_domain; - isl_union_set *domain; + isl_union_set *domain, *expanded; int single_statement; + node = gpu_tree_insert_shared_before_thread(node); + if (!node) + return NULL; + kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel); kernel = ppcg_kernel_create_local_arrays(kernel, gen->prog); if (!kernel) @@ -3642,8 +3853,13 @@ kernel->options = gen->options; kernel->context = extract_context(node, gen->prog); kernel->core = isl_union_set_universe(isl_union_set_copy(domain)); - kernel->arrays = accessed_by_domain(isl_union_set_copy(domain), - gen->prog); + contraction = isl_schedule_node_get_subtree_contraction(node); + kernel->contraction = isl_union_pw_multi_aff_copy(contraction); + expanded = isl_union_set_copy(domain); + expanded = isl_union_set_preimage_union_pw_multi_aff(expanded, + contraction); + kernel->expanded_domain = isl_union_set_copy(expanded); + kernel->arrays = accessed_by_domain(expanded, gen->prog); kernel->n_grid = n_outer_coincidence(node); node_thread = isl_schedule_node_copy(node); node_thread = gpu_tree_move_down_to_thread(node_thread, kernel->core); @@ -3693,7 +3909,8 @@ kernel->n_block, "t"); kernel->thread_filter = set_schedule_modulo(node, kernel->thread_ids, kernel->block_dim); - extract_block_size(kernel, domain); + if (extract_block_size(kernel, domain) < 0) + node = isl_schedule_node_free(node); node = gpu_tree_move_up_to_kernel(node); node = isl_schedule_node_child(node, 0); @@ -3726,16 +3943,22 @@ } node = gpu_tree_move_up_to_thread(node); - kernel->shared_schedule_dim = - isl_schedule_node_get_schedule_depth(node); - kernel->shared_schedule = + kernel->copy_schedule_dim = isl_schedule_node_get_schedule_depth(node); + kernel->copy_schedule = isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(node); + contraction = isl_union_pw_multi_aff_copy(kernel->contraction); + kernel->copy_schedule = + isl_union_pw_multi_aff_pullback_union_pw_multi_aff( + kernel->copy_schedule, contraction); node = gpu_tree_move_up_to_kernel(node); node = add_sync(kernel, node); node = add_copies(kernel, node); + node = gpu_tree_move_down_to_shared(node, kernel->core); + node = isl_schedule_node_delete(node); + node = gpu_tree_move_down_to_thread(node, kernel->core); node = isl_schedule_node_delete(node); @@ -3776,18 +3999,86 @@ return node; } +/* See if hybrid tiling can be performed on "node" and its parent. + * If so, apply hybrid tiling and return the updated schedule tree. + * If not, return the original schedule tree. + * Return NULL on error. + * + * First check if "node", together with its parent, meets + * the basic requirements for hybrid tiling. + * If so, compute the relative dependence distances of "node" + * with respect to its parent and check if they are sufficiently bounded. + * If so, apply hybrid tiling using user specified tile sizes. + * + * The tile sizes are read before the dependence distance bounds are + * computed, because the user may have specified fewer dimensions + * than are available. In this case, the remaining schedule dimensions + * are split off and the dependence distances should be computed + * after these dimensions have been split off. 
+ */ +static __isl_give isl_schedule_node *try_hybrid_tile(struct gpu_gen *gen, + __isl_take isl_schedule_node *node) +{ + int tile_len; + int *tile_size; + isl_bool ok; + isl_schedule_node *orig = node; + ppcg_ht_bounds *bounds; + + ok = ppcg_ht_parent_has_input_pattern(node); + if (ok < 0) + return isl_schedule_node_free(node); + if (!ok) + return orig; + + tile_len = 1 + isl_schedule_node_band_n_member(node); + tile_size = read_tile_sizes(gen, &tile_len); + if (!tile_size) + return isl_schedule_node_free(node); + + node = isl_schedule_node_copy(node); + node = split_band(node, tile_len - 1); + node = isl_schedule_node_parent(node); + bounds = ppcg_ht_compute_bounds(gen->prog->scop, node); + node = isl_schedule_node_child(node, 0); + + ok = ppcg_ht_bounds_is_valid(bounds); + if (ok >= 0 && ok) + node = gpu_hybrid_tile(gen, node, bounds, tile_size); + else + ppcg_ht_bounds_free(bounds); + free(tile_size); + + if (ok >= 0 && !ok) { + isl_schedule_node_free(node); + return orig; + } + isl_schedule_node_free(orig); + if (ok < 0) + return isl_schedule_node_free(node); + return node; +} + /* If "node" is the outermost permutable band that can be mapped to block and - * thread identifiers in its branch (or a leaf with no such outer bands), + * thread identifiers in its branch (or the root of a subtree with + * no such outer bands), * then mark the band as such, attaching a ppcg_kernel to the mark. * - * If "node" originally points to a leaf, then insert a zero-dimensional - * permutable band such that we can assume that "node" always - * points to a band node. + * If hybrid tiling is allowed, then first try and apply it + * to "node" and its parent. + * + * If "node" is the root of a subtree without permutable bands, + * then insert a zero-dimensional permutable band such that + * we can assume that "node" always points to a band node. + * This includes the case where "node" already points to a band node, + * but one without any coincident dimension. In this case, + * the extra node ensures that this original node does not get tiled. * * Tile "node" using user specified tile sizes, after splitting the band * if the number of specified tile sizes is smaller than the dimension * of the band. Mark the point band of this tiling as the band that - * needs to be mapped to threads. + * needs to be mapped to threads and instruct the AST generator to unroll + * the band if the "unroll_gpu_tile" option is set. * Create a kernel representing the domain instances that reach "node" and * insert a mark node pointing to the ppcg_kernel before the band node. 
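+ * Schematically, and before synchronization and copy statements are
+ * added, the resulting subtree has the illustrative form
+ *
+ *	kernel mark - tile band - thread mark - point band - ...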
 */
@@ -3808,7 +4099,16 @@
	if (!outer)
		return node;
-	if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf)
+	if (gen->options->hybrid) {
+		isl_schedule_node *saved = isl_schedule_node_copy(node);
+		node = try_hybrid_tile(gen, node);
+		isl_schedule_node_free(saved);
+		if (node != saved)
+			return node;
+	}
+
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_band ||
+	    !isl_schedule_node_band_member_get_coincident(node, 0))
		node = insert_empty_permutable_band(node);
	tile_len = isl_schedule_node_band_n_member(node);
@@ -3820,46 +4120,106 @@
	sizes = construct_band_tiles_sizes(node, tile_size);
	node = tile_band(node, isl_multi_val_copy(sizes));
	node = isl_schedule_node_child(node, 0);
+	if (gen->options->unroll_gpu_tile)
+		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
	id = isl_id_alloc(gen->ctx, "thread", NULL);
	node = isl_schedule_node_insert_mark(node, id);
	node = isl_schedule_node_parent(node);
	scale = gen->options->scale_tile_loops;
-	node = create_kernel(gen, node, scale, sizes);
+	node = gpu_create_kernel(gen, node, scale, sizes);
	isl_multi_val_free(sizes);
	free(tile_size);
	return node;
}
-/* Does the subtree rooted at "node" have any suitably permutable band nodes?
- * That is, does it have any nodes that are permutable and that
- * have a least one coincident dimension?
+/* Given a set or sequence node, return the union of the filters of either all
+ * (if "only_initial" is not set) or the initial (if "only_initial" is set)
+ * direct subtrees that do not contain any suitably permutable bands
+ * (according to subtree_has_permutable_bands).
 */
-static int subtree_has_permutable_bands(__isl_keep isl_schedule_node *node)
+static __isl_give isl_union_set *get_non_parallel_subtree_filters(
+	__isl_keep isl_schedule_node *node, int only_initial)
{
-	int any_parallelism = 0;
+	isl_space *space;
+	isl_union_set *filter;
+	int i, n;
-	if (isl_schedule_node_foreach_descendant_top_down(node, &set_permutable,
-		&any_parallelism) < 0 &&
-	    !any_parallelism)
-		return -1;
+	n = isl_schedule_node_n_children(node);
+	if (n < 0)
+		return NULL;
-	return any_parallelism;
+	node = isl_schedule_node_copy(node);
+	node = isl_schedule_node_child(node, 0);
+	filter = isl_schedule_node_filter_get_filter(node);
+	node = isl_schedule_node_parent(node);
+	space = isl_union_set_get_space(filter);
+	isl_union_set_free(filter);
+	filter = isl_union_set_empty(space);
+
+	for (i = 0; i < n; ++i) {
+		int parallelism;
+
+		node = isl_schedule_node_child(node, i);
+		parallelism = subtree_has_permutable_bands(node);
+		if (parallelism < 0) {
+			filter = isl_union_set_free(filter);
+		} else if (!parallelism) {
+			isl_union_set *filter_i;
+			filter_i = isl_schedule_node_filter_get_filter(node);
+			filter = isl_union_set_union(filter, filter_i);
+		} else if (only_initial)
+			break;
+		node = isl_schedule_node_parent(node);
+	}
+
+	isl_schedule_node_free(node);
+
+	return filter;
+}
+
+/* Given a set or sequence node, return the union of the filters of
+ * the direct subtrees that do not contain any suitably permutable bands
+ * (according to subtree_has_permutable_bands).
+ */
+static __isl_give isl_union_set *get_all_non_parallel_subtree_filters(
+	__isl_keep isl_schedule_node *node)
+{
+	return get_non_parallel_subtree_filters(node, 0);
+}
+
+/* Given a set or sequence node, return the union of the filters of
+ * the initial direct subtrees that do not contain any suitably permutable
+ * bands (according to subtree_has_permutable_bands).
+ */ +static __isl_give isl_union_set *get_initial_non_parallel_subtree_filters( + __isl_keep isl_schedule_node *node) +{ + return get_non_parallel_subtree_filters(node, 1); } /* Mark all variables that are accessed by the statement instances in "domain" * and that are local to "prog" as requiring a declaration in the host code. + * The statement instances in "domain" correspond to (a subset of) + * the active instances at "node". + * "node" is not modified by this function, except that NULL is returned + * in case of error. */ -static int declare_accessed_local_variables(struct gpu_prog *prog, +static __isl_give isl_schedule_node *declare_accessed_local_variables( + __isl_take isl_schedule_node *node, struct gpu_prog *prog, __isl_keep isl_union_set *domain) { + isl_union_pw_multi_aff *contraction; isl_union_set *arrays; int i; if (!ppcg_scop_any_hidden_declarations(prog->scop)) - return 0; - arrays = accessed_by_domain(isl_union_set_copy(domain), prog); + return node; + contraction = isl_schedule_node_get_subtree_contraction(node); + domain = isl_union_set_copy(domain); + domain = isl_union_set_preimage_union_pw_multi_aff(domain, contraction); + arrays = accessed_by_domain(domain, prog); for (i = 0; i < prog->n_array; ++i) { isl_space *space; @@ -3879,10 +4239,10 @@ } isl_union_set_free(arrays); - return 0; + return node; error: isl_union_set_free(arrays); - return -1; + return isl_schedule_node_free(node); } /* If "node" points to a set node, then separate its children @@ -3891,51 +4251,33 @@ * Adjust the schedule tree in order to execute the second group * after the first group and return a pointer to the first group, * assuming there are any such subtrees. - * Mark all local variables in "prog" that are accessed by - * the second group as requiring a declaration on the host. + * If "node" points to a sequence node, then separate the initial + * children that do not have suitably permutable bands and + * return a pointer to the subsequence of children that do have such bands, + * assuming there are any such subtrees. + * + * In both cases, mark all local variables in "prog" that are accessed by + * the group without permutable bands as requiring a declaration on the host. 
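+ * For example, for a set node with two children of which only the
+ * first contains a permutable band, the second child is ordered after
+ * the first and only the first one is later mapped to the device.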
*/ static __isl_give isl_schedule_node *isolate_permutable_subtrees( __isl_take isl_schedule_node *node, struct gpu_prog *prog) { - isl_space *space; isl_union_set *filter; - int i, n; + enum isl_schedule_node_type type; if (!node) return NULL; - if (isl_schedule_node_get_type(node) != isl_schedule_node_set) - return node; - - n = isl_schedule_node_n_children(node); - if (n < 0) - return isl_schedule_node_free(node); - - node = isl_schedule_node_child(node, 0); - filter = isl_schedule_node_filter_get_filter(node); - node = isl_schedule_node_parent(node); - space = isl_union_set_get_space(filter); - isl_union_set_free(filter); - filter = isl_union_set_empty(space); - - for (i = 0; i < n; ++i) { - int parallelism; - - node = isl_schedule_node_child(node, i); - parallelism = subtree_has_permutable_bands(node); - if (parallelism < 0) { - node = isl_schedule_node_free(node); - } else if (!parallelism) { - isl_union_set *filter_i; - filter_i = isl_schedule_node_filter_get_filter(node); - filter = isl_union_set_union(filter, filter_i); - } - node = isl_schedule_node_parent(node); + type = isl_schedule_node_get_type(node); + if (type == isl_schedule_node_set) { + filter = get_all_non_parallel_subtree_filters(node); + node = declare_accessed_local_variables(node, prog, filter); + node = isl_schedule_node_order_after(node, filter); + } else if (type == isl_schedule_node_sequence) { + filter = get_initial_non_parallel_subtree_filters(node); + node = declare_accessed_local_variables(node, prog, filter); + node = isl_schedule_node_order_before(node, filter); } - if (declare_accessed_local_variables(prog, filter) < 0) - node = isl_schedule_node_free(node); - node = isl_schedule_node_order_after(node, filter); - return node; } @@ -3983,51 +4325,6 @@ &mark_outer_permutable, gen); } -/* Save the schedule "schedule" to a file called "filename". - * The schedule is printed in block style. - */ -static void save_schedule(__isl_keep isl_schedule *schedule, - const char *filename) -{ - FILE *file; - isl_ctx *ctx; - isl_printer *p; - - if (!schedule) - return; - - file = fopen(filename, "w"); - if (!file) { - fprintf(stderr, "Unable to open '%s' for writing\n", filename); - return; - } - ctx = isl_schedule_get_ctx(schedule); - p = isl_printer_to_file(ctx, file); - p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK); - p = isl_printer_print_schedule(p, schedule); - isl_printer_free(p); - fclose(file); -} - -/* Load and return a schedule from a file called "filename". - */ -static __isl_give isl_schedule *load_schedule(isl_ctx *ctx, - const char *filename) -{ - FILE *file; - isl_schedule *schedule; - - file = fopen(filename, "r"); - if (!file) { - fprintf(stderr, "Unable to open '%s' for reading\n", filename); - return NULL; - } - schedule = isl_schedule_read_from_file(ctx, file); - fclose(file); - - return schedule; -} - /* Construct schedule constraints from the dependences in prog->scop and * the array order dependences in prog->array_order. * @@ -4108,6 +4405,8 @@ * We derive schedule constraints from the dependences in gen->prog->scop * and then use isl to compute a schedule that has a parallel loop * in each tilable band. + * During the schedule construction, some statement instances + * may be grouped first based on the input schedule. 
 */
static __isl_give isl_schedule *compute_schedule(struct gpu_gen *gen)
{
@@ -4115,7 +4414,8 @@
	isl_schedule_constraints *sc;
	isl_schedule *schedule;
	sc = construct_schedule_constraints(gen->prog);
-	schedule = isl_schedule_constraints_compute_schedule(sc);
+	schedule = gen->prog->scop->schedule;
+	schedule = ppcg_compute_schedule(sc, schedule, gen->options);
	return schedule;
}
@@ -4265,30 +4565,27 @@
	return schedule;
}
+/* Compute a schedule or determine the properties of the original schedule
+ * depending on the value of the "reschedule" option.
+ */
+static __isl_give isl_schedule *compute_or_set_properties(void *user)
+{
+	struct gpu_gen *gen = user;
+
+	if (gen->options->reschedule)
+		return compute_schedule(gen);
+	else
+		return determine_properties_original_schedule(gen);
+}
+
 /* Obtain a schedule for the scop, by reading it from
 * a file, by computing one or by determining the properties
 * of the original schedule.
 */
-__isl_give isl_schedule *get_schedule(struct gpu_gen *gen)
+static __isl_give isl_schedule *get_schedule(struct gpu_gen *gen)
{
-	isl_schedule *schedule;
-
-	if (gen->options->load_schedule_file) {
-		schedule = load_schedule(gen->ctx,
-			gen->options->load_schedule_file);
-	} else {
-		if (gen->options->reschedule)
-			schedule = compute_schedule(gen);
-		else
-			schedule = determine_properties_original_schedule(gen);
-		if (gen->options->save_schedule_file)
-			save_schedule(schedule,
-				gen->options->save_schedule_file);
-	}
-	if (gen->options->debug->dump_schedule)
-		isl_schedule_dump(schedule);
-
-	return schedule;
+	return ppcg_get_schedule(gen->ctx, gen->options,
+		&compute_or_set_properties, gen);
}
/* Construct the string "<a>_<b>".
@@ -4686,7 +4983,6 @@
	isl_space *space;
	isl_union_pw_multi_aff *contraction;
	isl_union_set *before, *after, *filter;
-	isl_union_map *flow;
	type = isl_schedule_node_get_parent_type(node);
	if (type != isl_schedule_node_sequence && type != isl_schedule_node_set)
@@ -4785,7 +5081,6 @@
	__isl_keep isl_schedule_node *node, struct gpu_prog *prog)
{
	struct ppcg_may_persist_data data;
-	isl_schedule_node *root;
	isl_union_pw_multi_aff *contraction;
	isl_union_set *domain;
	isl_union_set *persist;
@@ -4826,11 +5121,11 @@
 /* Add nodes for copying outer arrays in and out of the device
 * before and after the subtree "node", which contains one or more kernels.
- * "domain" contains the original reaching domain elements before
- * the kernels were created, i.e., before the contraction that
- * may have been performed in creating the kernels has been applied.
+ * "domain" contains the original statement instances, i.e.,
+ * those that correspond to the domains of the access relations in "prog".
+ * In particular, the domain has not been contracted in any way.
 * "prefix" contains the prefix schedule at that point, in terms
- * of the same original reaching domain elements.
+ * of the same original statement instances.
 *
 * We first compute the sets of outer array elements that need
 * to be copied in and out and then graft in the nodes for
@@ -4868,7 +5163,7 @@
	__isl_take isl_union_map *prefix, struct gpu_prog *prog)
{
	isl_union_set *local;
-	isl_union_set *to_device, *from_device, *may_persist;
+	isl_union_set *may_persist;
	isl_union_map *may_write, *must_write, *copy_out, *not_written;
	isl_union_map *read, *copy_in;
	isl_union_map *tagged;
@@ -4932,38 +5227,90 @@
	return node;
}
+/* Add nodes for initializing ("init_device") and clearing ("clear_device")
+ * the device before and after "node".
+ */ +static __isl_give isl_schedule_node *add_init_clear_device( + __isl_take isl_schedule_node *node) +{ + isl_ctx *ctx; + isl_space *space; + isl_union_set *domain; + isl_schedule_node *graft; + + ctx = isl_schedule_node_get_ctx(node); + + space = isl_space_set_alloc(ctx, 0, 0); + space = isl_space_set_tuple_name(space, isl_dim_set, "init_device"); + domain = isl_union_set_from_set(isl_set_universe(space)); + graft = isl_schedule_node_from_domain(domain); + + node = isl_schedule_node_graft_before(node, graft); + + space = isl_space_set_alloc(ctx, 0, 0); + space = isl_space_set_tuple_name(space, isl_dim_set, "clear_device"); + domain = isl_union_set_from_set(isl_set_universe(space)); + graft = isl_schedule_node_from_domain(domain); + + node = isl_schedule_node_graft_after(node, graft); + + return node; +} + /* Update "schedule" for mapping to a GPU device. * * In particular, insert a context node, create kernels for - * each outermost tilable band and introduce node for copying array - * in and out of the device. + * each outermost tilable band and introduce nodes for copying arrays + * in and out of the device and for initializing and clearing the device. * If the child of the initial root points to a set node, * then children of this node that do not contain any tilable bands * are separated from the other children and are not mapped to * the device. + * + * The GPU code is generated in a context where at least one + * statement instance is executed. The corresponding guard is inserted + * around the entire schedule. */ -__isl_give isl_schedule *map_to_device(struct gpu_gen *gen, +static __isl_give isl_schedule *map_to_device(struct gpu_gen *gen, __isl_take isl_schedule *schedule) { isl_schedule_node *node; isl_set *context; + isl_set *guard; isl_union_set *domain; isl_union_map *prefix; + isl_union_pw_multi_aff *contraction; + struct gpu_prog *prog; context = isl_set_copy(gen->prog->context); context = isl_set_from_params(context); schedule = isl_schedule_insert_context(schedule, context); + prog = gen->prog; + guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain)); + prog->context = isl_set_intersect(prog->context, isl_set_copy(guard)); + guard = isl_set_from_params(guard); + node = isl_schedule_get_root(schedule); isl_schedule_free(schedule); node = isl_schedule_node_child(node, 0); - if (isl_schedule_node_get_type(node) == isl_schedule_node_context) - node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_child(node, 0); node = isolate_permutable_subtrees(node, gen->prog); domain = isl_schedule_node_get_domain(node); + contraction = isl_schedule_node_get_subtree_contraction(node); + domain = isl_union_set_preimage_union_pw_multi_aff(domain, + isl_union_pw_multi_aff_copy(contraction)); prefix = isl_schedule_node_get_prefix_schedule_union_map(node); + prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix, + contraction); node = mark_kernels(gen, node); node = add_to_from_device(node, domain, prefix, gen->prog); + node = isl_schedule_node_root(node); + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_insert_guard(node, guard); + node = isl_schedule_node_child(node, 0); + node = add_init_clear_device(node); schedule = isl_schedule_node_get_schedule(node); isl_schedule_node_free(node); @@ -5032,13 +5379,88 @@ return NULL; } +/* Does the index expression "index" of "expr" represent an access + * to a single element? + * That is, is "index" completely specified? 
+ * + * If "expr" accesses elements from different spaces (i.e., fields + * of a structure), then it does not access a single element. + * Otherwise, if the single space of the access matches the space + * of "index", then the index expression is completely specified + * (no pointer to a lower-dimensional slice of the accessed array) + * and a single element is being accessed. + */ +static isl_bool complete_index(__isl_keep pet_expr *expr, + __isl_keep isl_multi_pw_aff *index) +{ + isl_union_map *read, *write, *all; + isl_map *map; + isl_space *space1, *space2; + isl_bool complete; + + read = pet_expr_access_get_may_read(expr); + write = pet_expr_access_get_may_write(expr); + all = isl_union_map_union(read, write); + if (!all) + return isl_bool_error; + if (isl_union_map_n_map(all) != 1) { + isl_union_map_free(all); + return isl_bool_false; + } + map = isl_map_from_union_map(all); + space1 = isl_map_get_space(map); + isl_map_free(map); + space2 = isl_multi_pw_aff_get_space(index); + complete = isl_space_tuple_is_equal(space1, isl_dim_out, + space2, isl_dim_out); + isl_space_free(space1); + isl_space_free(space2); + + return complete; +} + +/* Does "expr" access a single, fixed element (independently of the statement + * instance)? + * That is, does it have a completely specified constant index expression? + * + * Note that it is not sufficient for the index expression to be + * piecewise constant. isl_multi_pw_aff_is_cst can therefore not be used. + */ +static isl_bool accesses_fixed_element(__isl_keep pet_expr *expr) +{ + int i, n; + isl_multi_pw_aff *index; + isl_bool fixed = isl_bool_true; + + index = pet_expr_access_get_index(expr); + if (!index) + return isl_bool_error; + n = isl_multi_pw_aff_dim(index, isl_dim_out); + for (i = 0; i < n; ++i) { + isl_pw_aff *pa; + + pa = isl_multi_pw_aff_get_pw_aff(index, i); + fixed = isl_pw_aff_n_piece(pa) == 1; + if (fixed) + fixed = isl_pw_aff_is_cst(pa); + isl_pw_aff_free(pa); + if (fixed < 0 || !fixed) + break; + } + if (fixed >= 0 && fixed) + fixed = complete_index(expr, index); + isl_multi_pw_aff_free(index); + + return fixed; +} + /* Extract a gpu_stmt_access from "expr", append it to the list * that ends in *data->next_access and update the end of the list. * If the access expression performs a write, then it is considered * exact only if it appears in a single expression statement and * if its may access relation is equal to its must access relation. * - * The combined set of may accesses may be union if member accesses + * The combined set of may accesses may be a union if member accesses * are involved, but the entire set is derived from a single reference and * therefore from a single index expression. These accesses therefore * all map to the same outer array. @@ -5081,11 +5503,12 @@ access->tagged_access = extract_single_tagged_access(tagged, expr); access->access = isl_map_copy(access->tagged_access); access->access = isl_map_domain_factor_domain(access->access); + access->fixed_element = accesses_fixed_element(expr); *data->next_access = access; data->next_access = &(*data->next_access)->next; - if (!access->access) + if (!access->access || access->fixed_element < 0) return -1; return 0; @@ -5109,10 +5532,31 @@ &extract_access, &data); } +/* Has statement "stmt" been killed from "scop"? + * That is, is the instance set of "scop" free from any + * instances of "stmt"?
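
A minimal stand-alone illustration of this emptiness test, with made-up statement names (not part of the patch):

#include <isl/space.h>
#include <isl/set.h>
#include <isl/union_set.h>

/* The instance set below contains no S_1 instances, so the set
 * extracted in the space of S_1's domain is empty and S_1 would be
 * considered killed.
 */
static isl_bool illustrate_killed(isl_ctx *ctx)
{
        isl_union_set *instances;
        isl_set *stmt_domain, *left;
        isl_bool empty;

        instances = isl_union_set_read_from_str(ctx,
                "{ S_0[i] : 0 <= i < 10 }");
        stmt_domain = isl_set_read_from_str(ctx,
                "{ S_1[i] : 0 <= i < 10 }");
        left = isl_union_set_extract_set(instances,
                isl_set_get_space(stmt_domain));
        empty = isl_set_plain_is_empty(left);   /* isl_bool_true */
        isl_set_free(left);
        isl_set_free(stmt_domain);
        isl_union_set_free(instances);

        return empty;
}
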
+ */ +static isl_bool is_stmt_killed(struct ppcg_scop *scop, struct pet_stmt *stmt) +{ + isl_space *space; + isl_set *left; + isl_bool empty; + + if (!scop || !stmt) + return isl_bool_error; + space = isl_set_get_space(stmt->domain); + left = isl_union_set_extract_set(scop->domain, space); + empty = isl_set_plain_is_empty(left); + isl_set_free(left); + + return empty; +} + /* Return an array of gpu_stmt representing the statements in "scop". + * Do not collect array accesses for statements that have been killed. */ static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop, - __isl_keep isl_set *context, __isl_keep isl_union_map *any_to_outer) + __isl_keep isl_union_map *any_to_outer) { int i; struct gpu_stmt *stmts; @@ -5123,9 +5567,15 @@ for (i = 0; i < scop->pet->n_stmt; ++i) { struct gpu_stmt *s = &stmts[i]; + isl_bool killed; s->id = isl_set_get_tuple_id(scop->pet->stmts[i]->domain); s->stmt = scop->pet->stmts[i]; + killed = is_stmt_killed(scop, scop->pet->stmts[i]); + if (killed < 0) + return free_stmts(stmts, i + 1); + if (killed) + continue; if (pet_stmt_extract_accesses(s, any_to_outer) < 0) return free_stmts(stmts, i + 1); } @@ -5133,16 +5583,6 @@ return stmts; } -/* Callback for ppcg_print_guarded that calls the callback for generate_gpu. - */ -static __isl_give isl_printer *print_gpu(__isl_take isl_printer *p, void *user) -{ - struct gpu_gen *gen = user; - - return gen->print(p, gen->prog, gen->tree, &gen->types, - gen->print_user); -} - /* Generate CUDA code for "scop" and print it to "p". * After generating an AST for the transformed scop as explained below, * we call "gen->print" to print the AST in the desired output format @@ -5151,11 +5591,9 @@ * If it turns out that it does not make sense to generate GPU code, * then we generate CPU code instead. * - * The GPU code is generated in a context where at least one - * statement instance is executed. The corresponding guard (if any) is printed - * around the entire generated GPU code, except for the declaration - * of the arrays that are visible outside of the scop and that therefore - * cannot be declared inside the body of any possible guard. + * The declarations of the arrays that are visible outside of the scop + * are printed outside of the code generated from the schedule, + * because the generated code may involve a guard around the entire code. 
* * We first compute a schedule that respects the dependences * of the original program and select the outermost bands @@ -5210,7 +5648,6 @@ { struct gpu_prog *prog; isl_ctx *ctx; - isl_set *context, *guard; isl_schedule *schedule; int any_permutable; @@ -5222,17 +5659,11 @@ if (!prog) return isl_printer_free(p); - context = isl_set_copy(prog->context); - guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain)); - prog->context = isl_set_intersect(prog->context, isl_set_copy(guard)); - gen->prog = prog; schedule = get_schedule(gen); any_permutable = has_any_permutable_node(schedule); if (any_permutable < 0 || !any_permutable) { - isl_set_free(context); - isl_set_free(guard); if (any_permutable < 0) p = isl_printer_free(p); else @@ -5241,9 +5672,10 @@ } else { schedule = map_to_device(gen, schedule); gen->tree = generate_code(gen, schedule); - p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p); + p = ppcg_set_macro_names(p); p = ppcg_print_exposed_declarations(p, prog->scop); - p = ppcg_print_guarded(p, guard, context, &print_gpu, gen); + p = gen->print(p, gen->prog, gen->tree, &gen->types, + gen->print_user); isl_ast_node_free(gen->tree); } @@ -5309,7 +5741,7 @@ * arrays that are not local to "prog" and remove those elements that * are definitely killed or definitely written by "prog". */ -__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog) +static __isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog) { int i; isl_union_set *may_persist, *killed; @@ -5363,8 +5795,7 @@ space = isl_space_map_from_set(space); id = isl_map_identity(space); prog->any_to_outer = isl_union_map_add_map(prog->any_to_outer, id); - prog->stmts = extract_stmts(ctx, scop, - prog->context, prog->any_to_outer); + prog->stmts = extract_stmts(ctx, scop, prog->any_to_outer); prog->read = isl_union_map_copy(scop->reads); prog->may_write = isl_union_map_copy(scop->may_writes); prog->must_write = isl_union_map_copy(scop->must_writes); Index: polly/trunk/lib/External/ppcg/gpu_array_tile.h =================================================================== --- polly/trunk/lib/External/ppcg/gpu_array_tile.h +++ polly/trunk/lib/External/ppcg/gpu_array_tile.h @@ -9,7 +9,7 @@ * if shift != NULL. * If so, they express that current index is such that if you add shift, * then the result is always a multiple of stride. - * Let D represent the initial group->depth dimensions of the computed schedule. + * Let D represent the initial tile->depth dimensions of the computed schedule. * The spaces of "lb" and "shift" are of the form * * D -> [b] @@ -22,11 +22,14 @@ isl_aff *shift; }; -/* A tile of an array. +/* A tile of an outer array. * * requires_unroll is set if the schedule dimensions that are mapped * to threads need to be unrolled for this (private) tile to be used. * + * "depth" reflects the number of schedule dimensions that affect the tile. + * The copying into and/or out of the tile is performed at that depth. + * * n is the dimension of the array. * bound is an array of size "n" representing the lower bound * and size for each index. @@ -36,12 +39,13 @@ * * { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] } * - * where D represents the initial group->depth dimensions + * where D represents the initial "depth" dimensions * of the computed schedule. 
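
As a concrete illustration of this tiling (a minimal sketch with made-up names, not taken from the patch): for depth 1, a one-dimensional array, lb(i) = 4 * i, no shift and unit stride, the tiling maps element A[a] at schedule point D[i] to tile position T[a - 4 * i].

#include <isl/aff.h>

/* The tiling { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
 * instantiated with shift = 0, stride = 1 and lb(i) = 4 * i.
 */
static __isl_give isl_multi_aff *example_tiling(isl_ctx *ctx)
{
        return isl_multi_aff_read_from_str(ctx,
                "{ [D[i] -> A[a]] -> T[(a - 4 * i)] }");
}
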
*/ struct gpu_array_tile { isl_ctx *ctx; int requires_unroll; + int depth; int n; struct gpu_array_bound *bound; isl_multi_aff *tiling; Index: polly/trunk/lib/External/ppcg/gpu_group.h =================================================================== --- polly/trunk/lib/External/ppcg/gpu_group.h +++ polly/trunk/lib/External/ppcg/gpu_group.h @@ -10,10 +10,6 @@ * Otherwise, it is accessed from global memory. * Note that if both private_tile and shared_tile are set, then shared_tile * is only used inside group_common_shared_memory_tile. - * "depth" reflects the number of schedule dimensions that affect the tile - * (private_tile if set; shared_tile if shared_tile is set and private_tile - * is not). The copying into and/or out of the tile is performed at that - * depth. */ struct gpu_array_ref_group { /* The references in this group access this local array. */ struct gpu_local_array_info *local_array; /* This is the corresponding (global) array. */ struct gpu_array_info *array; /* Position of this group in the list of reference groups of array. */ int nr; /* The following fields are used during the construction of the groups. - * access is the combined access relation relative to the shared + * access is the combined access relation relative to the private * memory tiling. In particular, the domain of the map corresponds - * to the first shared_schedule_dim dimensions of the kernel schedule. + * to the first thread_depth dimensions of the kernel schedule. * write is set if any access in the group is a write. * exact_write is set if all writes are definite writes. * slice is set if there is at least one access in the group * that refers to more than one element + * "min_depth" is the minimum of the tile depths and thread_depth. */ isl_map *access; int write; int exact_write; int slice; + int min_depth; /* The shared memory tile, NULL if none. */ struct gpu_array_tile *shared_tile; /* The private memory tile, NULL if none. */ struct gpu_array_tile *private_tile; - int depth; - /* References in this group; point to elements of a linked list.
*/ int n_ref; struct gpu_stmt_access **refs; @@ -59,6 +55,8 @@ __isl_give isl_union_map *gpu_array_ref_group_access_relation( struct gpu_array_ref_group *group, int read, int write); int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group); +enum ppcg_group_access_type gpu_array_ref_group_type( + struct gpu_array_ref_group *group); struct gpu_array_tile *gpu_array_ref_group_tile( struct gpu_array_ref_group *group); struct gpu_array_ref_group *gpu_array_ref_group_free( Index: polly/trunk/lib/External/ppcg/gpu_group.c =================================================================== --- polly/trunk/lib/External/ppcg/gpu_group.c +++ polly/trunk/lib/External/ppcg/gpu_group.c @@ -1,3 +1,16 @@ +/* + * Copyright 2010-2011 INRIA Saclay + * Copyright 2012-2014 Ecole Normale Superieure + * Copyright 2015 Sven Verdoolaege + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France, + * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod, + * 91893 Orsay, France + * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France + */ + #include #include @@ -12,10 +25,12 @@ struct gpu_array_ref_group *group, __isl_take isl_printer *p) { int global = 0; + enum ppcg_group_access_type type; - if (group->private_tile) + type = gpu_array_ref_group_type(group); + if (type == ppcg_access_private) p = isl_printer_print_str(p, "private_"); - else if (group->shared_tile) + else if (type == ppcg_access_shared) p = isl_printer_print_str(p, "shared_"); else global = 1; @@ -52,19 +67,40 @@ return access; } -/* Return the effective gpu_array_tile associated to "group" or - * NULL if there is no such gpu_array_tile. +/* Should this array reference group be mapped to private, shared or global + * memory? * If we have computed both a private and a shared tile, then - * the private tile is used. + * the tile with the smallest depth is used. If both have the same depth, + * then the private tile is used. */ -struct gpu_array_tile *gpu_array_ref_group_tile( +enum ppcg_group_access_type gpu_array_ref_group_type( struct gpu_array_ref_group *group) { + if (group->private_tile && group->shared_tile && + group->shared_tile->depth < group->private_tile->depth) + return ppcg_access_shared; if (group->private_tile) - return group->private_tile; + return ppcg_access_private; if (group->shared_tile) + return ppcg_access_shared; + return ppcg_access_global; +} + + +/* Return the effective gpu_array_tile associated to "group" or + * NULL if there is no such gpu_array_tile. + */ +struct gpu_array_tile *gpu_array_ref_group_tile( + struct gpu_array_ref_group *group) +{ + switch (gpu_array_ref_group_type(group)) { + case ppcg_access_global: + return NULL; + case ppcg_access_shared: return group->shared_tile; - return NULL; + case ppcg_access_private: + return group->private_tile; + } } /* Does the tile associated to "group" require unrolling of the schedule @@ -371,11 +407,15 @@ * * We project the accesses on each index in turn and look for a parametric * offset such that the size is constant. + * + * tile->depth is initialized to the input dimension of the computed bounds. 
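
For instance (a minimal sketch, not part of the patch), can_tile() succeeds on the following access relation: at every schedule point D[i], a window of constant size 4 starting at the parametric offset lb(i) = i is accessed, and tile->depth is initialized to 1, the input dimension of the relation.

#include <isl/map.h>

/* An access relation that can be tiled with lb(i) = i and size 4. */
static __isl_give isl_map *example_tilable_access(isl_ctx *ctx)
{
        return isl_map_read_from_str(ctx,
                "{ D[i] -> A[a] : i <= a < i + 4 }");
}
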
*/ static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile) { int i; + tile->depth = isl_map_dim(access, isl_dim_in); + for (i = 0; i < tile->n; ++i) { isl_map *access_i; isl_basic_map *hull; @@ -399,9 +439,12 @@ * kernel_depth is the schedule depth where the kernel launch will * be introduced, i.e., it is the depth of the band that is mapped * to blocks. + * shared_depth is the schedule depth at which the copying to/from + * shared memory is computed. The copy operation may then + * later be hoisted to a higher level. * thread_depth is the schedule depth where the thread mark is located, * i.e., it is the depth of the band that is mapped to threads and also - * the schedule depth at which the copying to/from shared/private memory + * the schedule depth at which the copying to/from private memory * is computed. The copy operation may then later be hoisted to * a higher level. * n_thread is the number of schedule dimensions in the band that @@ -410,20 +453,27 @@ * of dimension thread_depth + n_thread) and encodes the mapping * to thread identifiers (as parameters). * host_sched contains the kernel_depth dimensions of the host schedule. - * shared_sched contains the first thread_depth dimensions of the + * shared_sched contains the first shared_depth dimensions of the + * kernel schedule. + * copy_sched contains the first thread_depth dimensions of the * kernel schedule. * thread_sched contains the first (thread_depth + n_thread) dimensions * of the kernel schedule. * full_sched is a union_map representation of the entire kernel schedule. + * The schedules are all formulated in terms of the original statement + * instances, i.e., those that appear in the domains of the access + * relations. */ struct gpu_group_data { struct ppcg_scop *scop; int kernel_depth; + int shared_depth; int thread_depth; int n_thread; isl_set *privatization; isl_union_map *host_sched; isl_union_map *shared_sched; + isl_union_map *copy_sched; isl_union_map *thread_sched; isl_union_map *full_sched; }; @@ -466,6 +516,7 @@ static int access_is_coalesced(struct gpu_group_data *data, __isl_keep isl_union_map *access) { + int dim; isl_space *space; isl_set *accessed; isl_map *access_map; @@ -481,7 +532,11 @@ space = isl_map_get_space(access_map); space = isl_space_range(space); - next_element = next(space, isl_space_dim(space, isl_dim_set) - 1); + dim = isl_space_dim(space, isl_dim_set); + if (dim == 0) + next_element = isl_map_empty(isl_space_map_from_set(space)); + else + next_element = next(space, dim - 1); accessed = isl_map_range(isl_map_copy(access_map)); map = isl_map_copy(next_element); @@ -576,7 +631,7 @@ { int i, j; - for (j = data->thread_depth - 1; j >= data->kernel_depth; --j) { + for (j = tile->depth - 1; j >= data->kernel_depth; --j) { for (i = 0; i < tile->n; ++i) { isl_aff *lb; isl_aff *shift; @@ -598,57 +653,156 @@ return ++j; } -/* Adjust the fields of "tile" to reflect the new input dimension "new_dim", - * where "old_dim" is the old dimension. - * The dimension beyond "new_dim" are assumed not to affect the tile, +/* Return the lowest depth between data->kernel_depth and data->thread_depth + * at which every array element accessed through "acc" is accessed + * by a single thread. The input dimension of "acc" is + * data->thread_depth + data->n_thread, where the final data->n_thread + * dimensions are those that will be mapped to threads. 
+ * If the values for these dimensions are uniquely determined + * by the array index and a given number of outer dimensions, then + * there is only one thread accessing that array element within those + * outer dimensions. + * + * The input space of "acc" is first split up, such that it has the form + * + * [O -> T] -> A + * + * with O the outer dimensions, T the dimensions that will be mapped to threads + * and A the array index. + * + * Then the positions of T and A are interchanged to simplify the test + * whether T uniquely depends on O and A. + * In particular, the above access relation is first combined with + * + * [O -> T] -> T + * + * to form + * + * [O -> T] -> [A -> T] + * + * from which + * + * O -> [A -> T] + * + * is extracted, which is then uncurried to + * + * [O -> A] -> T + * + * Finally, the final dimensions of O are projected out one by one + * until T is no longer uniquely determined by A and the remaining + * dimensions in O. The value returned is that of the last dimension + * that was successfully projected out. + * Note that there is no need to test whether [O -> A] -> T itself + * is single-valued as that was already tested in access_is_bijective. + */ +static int compute_accessed_by_single_thread_depth(struct gpu_group_data *data, + __isl_keep isl_map *acc) +{ + int i; + isl_space *space; + isl_map *map; + isl_bool sv; + + if (data->thread_depth == data->kernel_depth) + return data->thread_depth; + + acc = isl_map_copy(acc); + + space = isl_map_get_space(acc); + space = isl_space_params(space); + space = isl_space_set_from_params(space); + space = isl_space_add_dims(space, isl_dim_set, data->thread_depth); + space = isl_space_from_domain(space); + space = isl_space_add_dims(space, isl_dim_out, data->n_thread); + space = isl_space_wrap(space); + map = isl_set_flatten_map(isl_set_universe(space)); + acc = isl_map_apply_range(map, acc); + + space = isl_space_domain(isl_map_get_space(acc)); + map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space))); + acc = isl_map_range_product(acc, map); + acc = isl_map_domain_factor_domain(acc); + acc = isl_map_uncurry(acc); + + for (i = data->thread_depth - 1; i >= data->kernel_depth; --i) { + acc = isl_map_project_out(acc, isl_dim_in, i, 1); + sv = isl_map_is_single_valued(acc); + if (sv < 0) + return -1; + if (!sv) + break; + } + + isl_map_free(acc); + + return ++i; +} + +/* Adjust the fields of "tile" to reflect the new input dimension "depth". + * The dimensions beyond "depth" are assumed not to affect the tile, * so they can simply be dropped. */ -static int tile_adjust_depth(struct gpu_array_tile *tile, - int old_dim, int new_dim) +static int tile_adjust_depth(struct gpu_array_tile *tile, int depth) { int i; - if (old_dim == new_dim) + if (tile->depth == depth) return 0; for (i = 0; i < tile->n; ++i) { tile->bound[i].lb = isl_aff_drop_dims(tile->bound[i].lb, - isl_dim_in, new_dim, old_dim - new_dim); + isl_dim_in, depth, tile->depth - depth); if (!tile->bound[i].lb) return -1; if (!tile->bound[i].shift) continue; tile->bound[i].shift = isl_aff_drop_dims(tile->bound[i].shift, - isl_dim_in, new_dim, old_dim - new_dim); + isl_dim_in, depth, tile->depth - depth); if (!tile->bound[i].shift) return -1; } + tile->depth = depth; + return 0; } /* Determine the number of schedule dimensions that affect the offset of the - * shared or private tile and store the result in group->depth, with + * shared or private tile "tile" and store the result in tile->depth, with * a lower bound of data->kernel_depth.
- * If there is no tile defined on the array reference group, - * then set group->depth to data->thread_depth. - * Also adjust the fields of the tile to only refer to the group->depth + * Also adjust the fields of the tile to only refer to the tile->depth * outer schedule dimensions. */ -static int set_depth(struct gpu_group_data *data, - struct gpu_array_ref_group *group) +static isl_stat tile_set_depth(struct gpu_group_data *data, + struct gpu_array_tile *tile) { - struct gpu_array_tile *tile; + if (tile_adjust_depth(tile, compute_tile_depth(data, tile)) < 0) + return isl_stat_error; - group->depth = data->thread_depth; + return isl_stat_ok; +} - tile = gpu_array_ref_group_tile(group); - if (!tile) - return 0; +/* Determine the number of schedule dimensions that affect the offset of the + * shared tile and store the minimum of the private and shared tile depth + * in group->min_depth, with a lower bound of data->kernel_depth. + * If there is no tile defined on the array reference group, + * then set group->min_depth to data->thread_depth. + */ +static int set_depth(struct gpu_group_data *data, + struct gpu_array_ref_group *group) +{ + group->min_depth = data->thread_depth; - group->depth = compute_tile_depth(data, tile); - if (tile_adjust_depth(tile, data->thread_depth, group->depth) < 0) - return -1; + if (group->private_tile) { + if (group->private_tile->depth < group->min_depth) + group->min_depth = group->private_tile->depth; + } + if (group->shared_tile) { + if (tile_set_depth(data, group->shared_tile) < 0) + return -1; + if (group->shared_tile->depth < group->min_depth) + group->min_depth = group->shared_tile->depth; + } return 0; } @@ -666,7 +820,7 @@ { int i; int n; - isl_ctx *ctx = isl_union_map_get_ctx(data->shared_sched); + isl_ctx *ctx = isl_union_map_get_ctx(data->copy_sched); n = 0; for (i = 0; i < local->array->n_ref; ++i) { @@ -678,7 +832,7 @@ map = isl_map_copy(access->access); umap = isl_union_map_from_map(map); umap = isl_union_map_apply_domain(umap, - isl_union_map_copy(data->shared_sched)); + isl_union_map_copy(data->copy_sched)); if (isl_union_map_is_empty(umap)) { isl_union_map_free(umap); @@ -727,7 +881,7 @@ } /* Check if the access relations of group1 and group2 overlap within - * shared_sched. + * copy_sched. */ static int accesses_overlap(struct gpu_array_ref_group *group1, struct gpu_array_ref_group *group2) @@ -846,6 +1000,24 @@ return !bijective; } +/* Map the domain of "access" to the outer data->shared_depth + * schedule dimensions. When data->shared_depth is equal to + * data->thread_depth, this result is already available in group->access. + */ +static __isl_give isl_map *shared_access(struct gpu_array_ref_group *group, + __isl_keep isl_union_map *access, struct gpu_group_data *data) +{ + isl_union_map *shared; + + if (data->shared_depth == data->thread_depth) + return isl_map_copy(group->access); + + shared = isl_union_map_copy(access); + shared = isl_union_map_apply_domain(shared, + isl_union_map_copy(data->shared_sched)); + return isl_map_from_union_map(shared); +} + /* Compute the private and/or shared memory tiles for the array * reference group "group" of array "array". * Return 0 on success and -1 on error. @@ -883,6 +1055,15 @@ * and then they could be allowed to access the same memory elements, * but our check does not allow this situation. * + * For private memory tiles, the number of schedule dimensions that + * affect the offset is computed and stored in tile->depth, with + * a lower bound of data->kernel_depth. 
If this depth is smaller + * than the minimal depth that still ensures that every element + * is accessed by a single thread, then the depth is raised + * to this minimal depth. + * The fields of the tile are then adjusted to only refer to the tile->depth + * outer schedule dimensions. + * * We also check that the index expression only depends on parallel * loops. That way, we can move those loops innermost and unroll them. * Again, we use a test that is stricter than necessary. @@ -901,7 +1082,7 @@ * that are forcibly mapped to private memory. * * If the array is marked force_private, then we bypass all checks - * and assume we can (and should) use registers. + * and assume we can (and should) use registers only. * * If it turns out we can (or have to) use registers, we compute * the private memory tile size using can_tile, after introducing a dependence @@ -916,11 +1097,12 @@ int no_reuse, coalesced; isl_map *acc; int force_private = group->local_array->force_private; - int use_shared = kernel->options->use_shared_memory && + int use_shared = !force_private && kernel->options->use_shared_memory && data->n_thread > 0; int use_private = force_private || kernel->options->use_private_memory; int r = 0; int requires_unroll; + int unique_depth; if (!use_shared && !use_private) return 0; @@ -947,11 +1129,13 @@ if (use_shared && (!no_reuse || !coalesced)) { group->shared_tile = gpu_array_tile_create(ctx, group->array->n_index); + acc = shared_access(group, access, data); if (!group->shared_tile) r = -1; - else if (!can_tile(group->access, group->shared_tile)) + else if (!can_tile(acc, group->shared_tile)) group->shared_tile = gpu_array_tile_free(group->shared_tile); + isl_map_free(acc); } if (r < 0 || (!force_private && (!use_private || no_reuse))) { @@ -969,11 +1153,13 @@ return 0; } + unique_depth = compute_accessed_by_single_thread_depth(data, acc); + acc = isl_map_intersect_domain(acc, isl_set_copy(data->privatization)); acc = isl_map_project_out(acc, isl_dim_in, data->thread_depth, data->n_thread); requires_unroll = check_requires_unroll(data, acc, force_private); - if (requires_unroll < 0 || + if (unique_depth < 0 || requires_unroll < 0 || (requires_unroll && kernel->any_force_private)) { isl_map_free(acc); return requires_unroll < 0 ? -1 : 0; @@ -990,6 +1176,15 @@ isl_map_free(acc); + if (group->private_tile) { + struct gpu_array_tile *tile = group->private_tile; + int tile_depth = compute_tile_depth(data, tile); + if (tile_depth < unique_depth) + tile_depth = unique_depth; + if (tile_adjust_depth(tile, tile_depth) < 0) + return -1; + } + if (force_private && !group->private_tile) isl_die(ctx, isl_error_internal, "unable to map array reference group to registers", @@ -1071,7 +1266,7 @@ } /* Check if the access relations of group1 and group2 overlap within - * the outermost min(group1->depth, group2->depth) loops. + * the outermost min(group1->min_depth, group2->min_depth) loops. 
*/ static int depth_accesses_overlap(struct gpu_array_ref_group *group1, struct gpu_array_ref_group *group2) @@ -1081,9 +1276,9 @@ int empty; isl_map *map_i, *map_j, *map; - depth = group1->depth; - if (group2->depth < depth) - depth = group2->depth; + depth = group1->min_depth; + if (group2->min_depth < depth) + depth = group2->min_depth; map_i = isl_map_copy(group1->access); dim = isl_map_dim(map_i, isl_dim_in); map_i = isl_map_eliminate(map_i, isl_dim_in, depth, dim - depth); @@ -1150,25 +1345,17 @@ { int i, j; int recompute_overlap = 0; - isl_ctx *ctx = isl_space_get_ctx(array->space); for (i = 0; i < n; ++i) { if (!groups[i]->shared_tile) continue; for (j = n - 1; j > i; --j) { - isl_map *map; - int empty; struct gpu_array_ref_group *group; if (!groups[j]->shared_tile) continue; - map = isl_map_intersect(isl_map_copy(groups[i]->access), - isl_map_copy(groups[j]->access)); - empty = isl_map_is_empty(map); - isl_map_free(map); - - if (empty) + if (!depth_accesses_overlap(groups[i], groups[j])) continue; group = join_groups(groups[i], groups[j]); @@ -1184,8 +1371,8 @@ continue; } - if (group->depth < groups[i]->depth || - group->depth < groups[j]->depth) + if (group->min_depth < groups[i]->min_depth || + group->min_depth < groups[j]->min_depth) recompute_overlap = 1; gpu_array_ref_group_free(groups[i]); gpu_array_ref_group_free(groups[j]); @@ -1208,7 +1395,7 @@ static void set_array_groups(struct gpu_local_array_info *array, int n, struct gpu_array_ref_group **groups) { - int i, j; + int i; array->n_group = n; array->groups = groups; @@ -1251,7 +1438,8 @@ * If the array contains structures, then we compute a single * reference group without trying to find any tiles * since we do not map such arrays to private or shared - * memory. + * memory. The only exception is when those arrays of structures + * are required to be mapped to private memory. */ static int group_array_references(struct ppcg_kernel *kernel, struct gpu_local_array_info *local, struct gpu_group_data *data) @@ -1268,7 +1456,7 @@ n = populate_array_references(local, groups, data); - if (local->array->has_compound_element) { + if (local->array->has_compound_element && !local->force_private) { n = join_all_groups(n, groups); set_array_groups(local, n, groups); return 0; @@ -1295,42 +1483,51 @@ return -1; } -/* For each scalar in the input program, check if there are any - * order dependences active inside the current kernel, within - * the same iteration of "host_schedule". - * If so, mark the scalar as force_private so that it will be - * mapped to a register. +/* For each array in the input program that can be mapped to private memory, + * check if there are any order dependences active inside the current kernel, + * within the same iteration of the host schedule, i.e., the prefix + * schedule at "node". + * If so, mark the array as force_private so that its reference groups will be + * mapped to registers. + * + * Note that the arrays that cannot be mapped to private memory have + * had their order dependences added to prog->array_order and + * subsequently to the coincidence constraints.
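
The restriction to pairs of instances that share the same value of the host (prefix) schedule is expressed with isl_union_map_eq_at_multi_union_pw_aff; a minimal sketch with made-up relations (not ppcg's actual dependences):

#include <isl/aff.h>
#include <isl/union_map.h>

/* Keep only the dependence pairs that are assigned the same value
 * by a one-dimensional prefix schedule; of { S[i] -> T[j] : j >= i },
 * only { S[i] -> T[i] } survives.
 */
static __isl_give isl_union_map *same_prefix_order(isl_ctx *ctx)
{
        isl_union_map *order;
        isl_multi_union_pw_aff *prefix;

        order = isl_union_map_read_from_str(ctx,
                "{ S[i] -> T[j] : j >= i }");
        prefix = isl_multi_union_pw_aff_read_from_str(ctx,
                "[{ S[i] -> [(i)]; T[j] -> [(j)] }]");
        return isl_union_map_eq_at_multi_union_pw_aff(order, prefix);
}
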
*/ -static void check_scalar_live_ranges_in_host(struct ppcg_kernel *kernel, - __isl_take isl_union_map *host_schedule) +static void check_can_be_private_live_ranges(struct ppcg_kernel *kernel, + __isl_keep isl_schedule_node *node) { int i; - isl_union_map *sched; isl_union_set *domain; - isl_union_map *same_host_iteration; + isl_multi_union_pw_aff *prefix; + isl_union_pw_multi_aff *contraction; - kernel->any_force_private = 0; + if (!kernel->options->live_range_reordering) + return; - sched = isl_union_map_universe(isl_union_map_copy(host_schedule)); - domain = isl_union_map_domain(sched); + kernel->any_force_private = 0; - same_host_iteration = isl_union_map_apply_range(host_schedule, - isl_union_map_reverse(isl_union_map_copy(host_schedule))); + prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); + contraction = isl_union_pw_multi_aff_copy(kernel->contraction); + prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(prefix, + contraction); + domain = isl_union_set_copy(kernel->expanded_domain); + domain = isl_union_set_universe(domain); for (i = 0; i < kernel->n_array; ++i) { struct gpu_local_array_info *local = &kernel->array[i]; isl_union_map *order; local->force_private = 0; - if (local->array->n_index != 0) + if (!gpu_array_can_be_private(local->array)) continue; order = isl_union_map_copy(local->array->dep_order); order = isl_union_map_intersect_domain(order, isl_union_set_copy(domain)); order = isl_union_map_intersect_range(order, isl_union_set_copy(domain)); - order = isl_union_map_intersect(order, - isl_union_map_copy(same_host_iteration)); + order = isl_union_map_eq_at_multi_union_pw_aff(order, + isl_multi_union_pw_aff_copy(prefix)); if (!isl_union_map_is_empty(order)) { local->force_private = 1; kernel->any_force_private = 1; @@ -1338,45 +1535,40 @@ isl_union_map_free(order); } - isl_union_map_free(same_host_iteration); + isl_multi_union_pw_aff_free(prefix); isl_union_set_free(domain); } -/* For each scalar in the input program, check if there are any - * order dependences active inside the current kernel, within - * the same iteration of the host schedule, i.e., the prefix - * schedule at "node". - * If so, mark the scalar as force_private so that it will be - * mapped to a register. +/* Expand the domain of the schedule "s" by plugging in + * the contraction "contraction" and return the result. */ -static void check_scalar_live_ranges(struct ppcg_kernel *kernel, - __isl_keep isl_schedule_node *node) +static __isl_give isl_union_map *expand(__isl_take isl_union_map *s, + __isl_keep isl_union_pw_multi_aff *contraction) { - isl_union_map *sched; - - if (!kernel->options->live_range_reordering) - return; - - sched = isl_schedule_node_get_prefix_schedule_union_map(node); - - check_scalar_live_ranges_in_host(kernel, sched); + contraction = isl_union_pw_multi_aff_copy(contraction); + s = isl_union_map_preimage_domain_union_pw_multi_aff(s, contraction); + return s; } /* Create a set of dimension data->thread_depth + data->n_thread * that equates the residue of the final data->n_thread dimensions - * modulo the "sizes" to the thread identifiers. - * "space" is a parameter space containing the thread identifiers. + * modulo the kernel->block_dim sizes to the thread identifiers. * Store the computed set in data->privatization. + * + * The construction starts with the space of kernel->thread_filter, + * which is known to reference all thread identifiers. 
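
For thread_depth 1, a single thread dimension and a block size of 32, the constructed set would look as follows (a minimal sketch; "t0" stands for the thread identifier parameter):

#include <isl/set.h>

/* The final dimension is restricted so that its residue modulo
 * the block size equals the thread identifier t0.
 */
static __isl_give isl_set *example_privatization(isl_ctx *ctx)
{
        return isl_set_read_from_str(ctx,
                "[t0] -> { [i0, i1] : (i1 mod 32) = t0 }");
}
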
+ */ static void compute_privatization(struct gpu_group_data *data, - __isl_take isl_space *space, int *sizes) + struct ppcg_kernel *kernel) { int i; isl_ctx *ctx; + isl_space *space; isl_local_space *ls; isl_set *set; ctx = isl_union_map_get_ctx(data->shared_sched); + space = isl_union_set_get_space(kernel->thread_filter); space = isl_space_set_from_params(space); space = isl_space_add_dims(space, isl_dim_set, data->thread_depth + data->n_thread); @@ -1388,15 +1580,16 @@ isl_aff *aff, *aff2; isl_constraint *c; isl_val *v; - char name[20]; + isl_id *id; int pos; aff = isl_aff_var_on_domain(isl_local_space_copy(ls), isl_dim_set, data->thread_depth + i); - v = isl_val_int_from_si(ctx, sizes[i]); + v = isl_val_int_from_si(ctx, kernel->block_dim[i]); aff = isl_aff_mod_val(aff, v); - snprintf(name, sizeof(name), "t%d", i); - pos = isl_set_find_dim_by_name(set, isl_dim_param, name); + id = isl_id_list_get_id(kernel->thread_ids, i); + pos = isl_set_find_dim_by_id(set, isl_dim_param, id); + isl_id_free(id); aff2 = isl_aff_var_on_domain(isl_local_space_copy(ls), isl_dim_param, pos); aff = isl_aff_sub(aff, aff2); @@ -1408,8 +1601,24 @@ data->privatization = set; } +/* Return the prefix schedule at "node" as a relation + * between domain elements and schedule dimensions after detecting + * equalities in this relation. + */ +static __isl_give isl_union_map *prefix_with_equalities( + __isl_keep isl_schedule_node *node) +{ + isl_union_map *schedule; + + schedule = isl_schedule_node_get_prefix_schedule_relation(node); + schedule = isl_union_map_detect_equalities(schedule); + + return schedule; +} + /* Group references of all arrays in "kernel". * "node" points to the kernel mark. + * The mapping to shared memory is computed at the "shared" mark. * * We first extract all required schedule information into * a gpu_group_data structure and then consider each array @@ -1420,10 +1629,10 @@ { int i; int r = 0; - isl_space *space; + isl_union_pw_multi_aff *contraction; struct gpu_group_data data; - check_scalar_live_ranges(kernel, node); + check_can_be_private_live_ranges(kernel, node); data.scop = kernel->prog->scop; data.kernel_depth = isl_schedule_node_get_schedule_depth(node); data.host_sched = isl_schedule_node_get_prefix_schedule_relation(node); node = isl_schedule_node_copy(node); - node = gpu_tree_move_down_to_thread(node, kernel->core); - data.shared_sched = - isl_schedule_node_get_prefix_schedule_relation(node); - data.shared_sched = isl_union_map_detect_equalities(data.shared_sched); + node = gpu_tree_move_down_to_shared(node, kernel->core); + data.shared_depth = isl_schedule_node_get_schedule_depth(node); + data.shared_sched = prefix_with_equalities(node); + node = gpu_tree_move_down_to_thread(node, kernel->core); node = isl_schedule_node_child(node, 0); data.thread_depth = isl_schedule_node_get_schedule_depth(node); data.n_thread = isl_schedule_node_band_n_member(node); - data.thread_sched = isl_union_map_copy(data.shared_sched); + if (data.thread_depth == data.shared_depth) + data.copy_sched = isl_union_map_copy(data.shared_sched); + else + data.copy_sched = prefix_with_equalities(node); + data.thread_sched = isl_union_map_copy(data.copy_sched); data.thread_sched = isl_union_map_flat_range_product(data.thread_sched, isl_schedule_node_band_get_partial_schedule_union_map(node)); data.thread_sched = isl_union_map_detect_equalities(data.thread_sched); + + contraction = isl_union_pw_multi_aff_copy(kernel->contraction); + data.host_sched = expand(data.host_sched, contraction); + data.shared_sched = expand(data.shared_sched, contraction); + if
(data.thread_depth == data.shared_depth) { + isl_union_map_free(data.copy_sched); + data.copy_sched = isl_union_map_copy(data.shared_sched); + } else { + data.copy_sched = expand(data.copy_sched, contraction); + } + data.thread_sched = expand(data.thread_sched, contraction); + isl_union_pw_multi_aff_free(contraction); + node = isl_schedule_node_child(node, 0); data.full_sched = isl_union_map_copy(data.thread_sched); data.full_sched = isl_union_map_flat_range_product(data.full_sched, isl_schedule_node_get_subtree_schedule_union_map(node)); isl_schedule_node_free(node); - space = isl_union_set_get_space(kernel->thread_filter); - compute_privatization(&data, space, kernel->block_dim); + compute_privatization(&data, kernel); for (i = 0; i < kernel->n_array; ++i) { r = group_array_references(kernel, &kernel->array[i], &data); @@ -1460,6 +1685,7 @@ isl_union_map_free(data.host_sched); isl_union_map_free(data.shared_sched); + isl_union_map_free(data.copy_sched); isl_union_map_free(data.thread_sched); isl_union_map_free(data.full_sched); isl_set_free(data.privatization); @@ -1471,7 +1697,7 @@ * * { D -> A } * - * where D represents the first group->depth schedule dimensions + * where D represents the first tile->depth schedule dimensions * and A represents the array, construct an isl_multi_aff * * { [D[i] -> A[a]] -> A'[a'] } @@ -1542,7 +1768,7 @@ * * { [D[i] -> A[a]] -> T[t] } * - * where D represents the first group->depth schedule dimensions, + * where D represents the first tile->depth schedule dimensions, * A represents the global array and T represents the shared or * private memory tile. The name of T is the name of the local * array. @@ -1558,24 +1784,19 @@ void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group) { int i; - int dim; struct gpu_array_tile *tile; - struct gpu_array_info *array = group->array; isl_space *space; isl_multi_aff *tiling, *lb, *insert_array; isl_printer *p; char *local_name; - tile = group->private_tile; - if (!tile) - tile = group->shared_tile; + tile = gpu_array_ref_group_tile(group); if (!tile) return; space = isl_map_get_space(group->access); - dim = isl_space_dim(space, isl_dim_in); - space = isl_space_drop_dims(space, isl_dim_in, group->depth, - dim - group->depth); + space = isl_space_from_range(isl_space_range(space)); + space = isl_space_add_dims(space, isl_dim_in, tile->depth); insert_array = isl_multi_aff_domain_map(isl_space_copy(space)); for (i = 0; i < tile->n; ++i) Index: polly/trunk/lib/External/ppcg/gpu_hybrid.h =================================================================== --- polly/trunk/lib/External/ppcg/gpu_hybrid.h +++ polly/trunk/lib/External/ppcg/gpu_hybrid.h @@ -0,0 +1,13 @@ +#ifndef GPU_HYBRID_H +#define GPU_HYBRID_H + +#include + +#include "gpu.h" +#include "hybrid.h" + +__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen, + __isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds, + int *tile_sizes); + +#endif Index: polly/trunk/lib/External/ppcg/gpu_hybrid.c =================================================================== --- polly/trunk/lib/External/ppcg/gpu_hybrid.c +++ polly/trunk/lib/External/ppcg/gpu_hybrid.c @@ -0,0 +1,146 @@ +/* + * Copyright 2013 Ecole Normale Superieure + * Copyright 2015 Sven Verdoolaege + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France + */ + +#include + +#include +#include +#include +#include + +#include "hybrid.h" +#include "gpu_hybrid.h" 
+#include "gpu_tree.h" +#include "schedule.h" +#include "util.h" + +/* Have all domain elements been filtered out before reaching + * the "node" position in the schedule tree? + */ +static isl_bool has_empty_domain(__isl_keep isl_schedule_node *node) +{ + isl_union_set *domain; + isl_bool empty; + + domain = isl_schedule_node_get_domain(node); + empty = isl_union_set_is_empty(domain); + isl_union_set_free(domain); + + return empty; +} + +/* Given a pointer to a phase in the result of hybrid tiling, + * map the phase to the device, provided the phase is non-empty. + * Empty phases can occur if the input schedule domain can be + * covered by a small number of hexagons that all belong to the same phase. + * + * The input has the following form: + * + * M - CT - P - C - ... + * + * with M the phase marker, CT the space tiling, P the original + * parent band and C the original child band. + * The (outer dimensions of the) C band need to be mapped to threads. + * The (outer dimension of the) CT band needs to be mapped to blocks. + * The mapping to shared memory needs to be computed between the CT and + * the P band. + * + * The C band is first shifted to start at zero. + * Then the appropriate markers are introduced and a kernel is + * created for the tree rooted at CT. + * If the "unroll_gpu_tile" option is set, then the AST generator + * is instructed to unroll the P and C bands. + */ +static __isl_give isl_schedule_node *update_phase( + __isl_take isl_schedule_node *node, void *user) +{ + struct gpu_gen *gen = user; + int depth0, depth; + isl_ctx *ctx; + isl_id *id; + isl_bool empty_domain; + ppcg_ht_phase *phase; + + empty_domain = has_empty_domain(node); + if (empty_domain < 0) + return isl_schedule_node_free(node); + if (empty_domain) + return node; + + if (!node) + return NULL; + ctx = isl_schedule_node_get_ctx(node); + + phase = ppcg_ht_phase_extract_from_mark(node); + + depth0 = isl_schedule_node_get_tree_depth(node); + + node = isl_schedule_node_child(node, 0); + + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_child(node, 0); + node = ppcg_ht_phase_shift_space_point(phase, node); + if (gen->options->unroll_gpu_tile) + node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); + id = isl_id_alloc(ctx, "thread", NULL); + node = isl_schedule_node_insert_mark(node, id); + node = isl_schedule_node_parent(node); + if (gen->options->unroll_gpu_tile) + node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); + id = isl_id_alloc(ctx, "shared", NULL); + node = isl_schedule_node_insert_mark(node, id); + node = isl_schedule_node_parent(node); + + node = gpu_create_kernel(gen, node, 0, NULL); + + depth = isl_schedule_node_get_tree_depth(node); + node = isl_schedule_node_ancestor(node, depth - depth0); + + return node; +} + +/* Apply hybrid tiling on "node" and its parent based on the (valid) + * bounds on the relative dependence distances "bounds" and + * the tile sizes in "tile_sizes". + * The number of elements in "tile_sizes" is at least as large + * as the sum of the dimensions of the parent and the child node. + * + * Convert the tile_sizes to an isl_multi_val in the right space, + * insert the hybrid tiling and then create a kernel inside each phase. + * Finally, remove the phase marks. 
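
The conversion of the tile sizes is performed by ppcg_multi_val_from_int_list(); the following stand-alone sketch shows the same construction (hypothetical simplified helper, without the error handling of the real one):

#include <isl/val.h>
#include <isl/space.h>

/* Build an isl_multi_val in "space" from the first n entries of
 * "sizes", where n is the set dimension of "space".
 */
static __isl_give isl_multi_val *multi_val_from_ints(
        __isl_take isl_space *space, int *sizes)
{
        int i, n;
        isl_ctx *ctx;
        isl_multi_val *mv;

        n = isl_space_dim(space, isl_dim_set);
        ctx = isl_space_get_ctx(space);
        mv = isl_multi_val_zero(space);
        for (i = 0; i < n; ++i)
                mv = isl_multi_val_set_val(mv, i,
                        isl_val_int_from_si(ctx, sizes[i]));

        return mv;
}
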
+ */ +__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen, + __isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds, + int *tile_sizes) +{ + isl_multi_val *mv; + isl_space *space, *space2; + + if (!node || !bounds) + goto error; + + space2 = isl_schedule_node_band_get_space(node); + node = isl_schedule_node_parent(node); + space = isl_schedule_node_band_get_space(node); + space = isl_space_product(space, space2); + mv = ppcg_multi_val_from_int_list(space, tile_sizes); + + node = ppcg_ht_bounds_insert_tiling(bounds, mv, node, gen->options); + + node = hybrid_tile_foreach_phase(node, &update_phase, gen); + + node = hybrid_tile_drop_phase_marks(node); + + return node; +error: + isl_schedule_node_free(node); + ppcg_ht_bounds_free(bounds); + return NULL; +} Index: polly/trunk/lib/External/ppcg/gpu_print.h =================================================================== --- polly/trunk/lib/External/ppcg/gpu_print.h +++ polly/trunk/lib/External/ppcg/gpu_print.h @@ -9,6 +9,9 @@ __isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p, struct gpu_types *types, struct gpu_prog *prog); +__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p, + __isl_keep isl_ast_node *node); + __isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn, struct gpu_array_info *array); __isl_give isl_printer *gpu_array_info_print_declaration_argument( Index: polly/trunk/lib/External/ppcg/gpu_print.c =================================================================== --- polly/trunk/lib/External/ppcg/gpu_print.c +++ polly/trunk/lib/External/ppcg/gpu_print.c @@ -22,19 +22,19 @@ struct gpu_prog *prog) { int i; - isl_ast_build *build; if (!prog) return isl_printer_free(p); - build = isl_ast_build_from_context(isl_set_copy(prog->scop->context)); for (i = 0; i < prog->n_array; ++i) { - if (!prog->array[i].declare_local) + struct gpu_array_info *array = &prog->array[i]; + isl_ast_expr *size; + + if (!array->declare_local) continue; - p = ppcg_print_declaration(p, prog->scop->pet->arrays[i], - build); + size = array->declared_size; + p = ppcg_print_declaration_with_size(p, array->type, size); } - isl_ast_build_free(build); return p; } @@ -47,8 +47,12 @@ int i; for (i = 0; i < array->n_index; ++i) { + isl_ast_expr *bound; + prn = isl_printer_print_str(prn, "("); - prn = isl_printer_print_pw_aff(prn, array->bound[i]); + bound = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i); + prn = isl_printer_print_ast_expr(prn, bound); + isl_ast_expr_free(bound); prn = isl_printer_print_str(prn, ") * "); } prn = isl_printer_print_str(prn, "sizeof("); @@ -63,18 +67,10 @@ static __isl_give isl_printer *print_non_linearized_declaration_argument( __isl_take isl_printer *p, struct gpu_array_info *array) { - int i; - p = isl_printer_print_str(p, array->type); p = isl_printer_print_str(p, " "); - p = isl_printer_print_str(p, array->name); - - for (i = 0; i < array->n_index; i++) { - p = isl_printer_print_str(p, "["); - p = isl_printer_print_pw_aff(p, array->bound[i]); - p = isl_printer_print_str(p, "]"); - } + p = isl_printer_print_ast_expr(p, array->bound_expr); return p; } @@ -136,16 +132,11 @@ /* Print an access to the element in the global memory copy * described by "stmt". The index of the copy is recorded in * stmt->index as an access to the array. - * - * The copy in global memory has been linearized, so we need to take - * the array size into account. 
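
Note that the size computation in gpu_array_info_print_size above assumes that array->bound_expr is an access-style AST expression A[b0]...[bn-1], where operation argument 0 is the array reference itself and argument 1 + i is the bound in dimension i. A minimal sketch of that convention (hypothetical helper):

#include <isl/ast.h>

/* Return bound "i" of an access-style expression A[b0]...[bn-1],
 * i.e., operation argument 1 + i.
 */
static __isl_give isl_ast_expr *get_bound(
        __isl_keep isl_ast_expr *bound_expr, int i)
{
        return isl_ast_expr_get_op_arg(bound_expr, 1 + i);
}
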
*/ static __isl_give isl_printer *stmt_print_global_index( __isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt) { - int i; struct gpu_array_info *array = stmt->u.c.array; - struct gpu_local_array_info *local = stmt->u.c.local_array; isl_ast_expr *index; if (gpu_array_is_scalar(array)) { @@ -156,8 +147,6 @@ } index = isl_ast_expr_copy(stmt->u.c.index); - if (array->linearize) - index = gpu_local_array_info_linearize_index(local, index); p = isl_printer_print_ast_expr(p, index); isl_ast_expr_free(index); @@ -200,6 +189,72 @@ return pet_stmt_print_body(stmt->u.d.stmt->stmt, p, stmt->u.d.ref2expr); } +/* This function is called for each node in a GPU AST. + * In case of a user node, print the macro definitions required + * for printing the AST expressions in the annotation, if any. + * For other nodes, return true such that descendants are also + * visited. + * + * In particular, for a kernel launch, print the macro definitions + * needed for the grid size. + * For a copy statement, print the macro definitions needed + * for the two index expressions. + * For an original user statement, print the macro definitions + * needed for the substitutions. + */ +static isl_bool at_node(__isl_keep isl_ast_node *node, void *user) +{ + const char *name; + isl_id *id; + int is_kernel; + struct ppcg_kernel *kernel; + struct ppcg_kernel_stmt *stmt; + isl_printer **p = user; + + if (isl_ast_node_get_type(node) != isl_ast_node_user) + return isl_bool_true; + + id = isl_ast_node_get_annotation(node); + if (!id) + return isl_bool_false; + + name = isl_id_get_name(id); + if (!name) + return isl_bool_error; + is_kernel = !strcmp(name, "kernel"); + kernel = is_kernel ? isl_id_get_user(id) : NULL; + stmt = is_kernel ? NULL : isl_id_get_user(id); + isl_id_free(id); + + if ((is_kernel && !kernel) || (!is_kernel && !stmt)) + return isl_bool_error; + + if (is_kernel) { + *p = ppcg_ast_expr_print_macros(kernel->grid_size_expr, *p); + } else if (stmt->type == ppcg_kernel_copy) { + *p = ppcg_ast_expr_print_macros(stmt->u.c.index, *p); + *p = ppcg_ast_expr_print_macros(stmt->u.c.local_index, *p); + } else if (stmt->type == ppcg_kernel_domain) { + *p = ppcg_print_body_macros(*p, stmt->u.d.ref2expr); + } + if (!*p) + return isl_bool_error; + + return isl_bool_false; +} + +/* Print the required macros for the GPU AST "node" to "p", + * including those needed for the user statements inside the AST. + */ +__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p, + __isl_keep isl_ast_node *node) +{ + if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0) + return isl_printer_free(p); + p = ppcg_print_macros(p, node); + return p; +} + /* Was the definition of "type" printed before? * That is, does its name appear in the list of printed types "types"? 
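
The traversal used by gpu_print_macros() above follows the usual isl_ast_node_foreach_descendant_top_down pattern; a minimal sketch that merely counts the user nodes of an AST (hypothetical example, not part of the patch):

#include <isl/ast.h>

/* Count a user node and do not descend further, since user nodes
 * have no children; keep descending at any other node.
 */
static isl_bool count_user(__isl_keep isl_ast_node *node, void *user)
{
        int *n = user;

        if (isl_ast_node_get_type(node) != isl_ast_node_user)
                return isl_bool_true;
        ++*n;
        return isl_bool_false;
}

/* Return the number of user nodes in "node", or -1 on error. */
static int n_user_nodes(__isl_keep isl_ast_node *node)
{
        int n = 0;

        if (isl_ast_node_foreach_descendant_top_down(node,
                                                &count_user, &n) < 0)
                return -1;
        return n;
}
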
*/ Index: polly/trunk/lib/External/ppcg/gpu_tree.h =================================================================== --- polly/trunk/lib/External/ppcg/gpu_tree.h +++ polly/trunk/lib/External/ppcg/gpu_tree.h @@ -5,7 +5,11 @@ #include "gpu.h" +__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread( + __isl_take isl_schedule_node *node); int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node); +__isl_give isl_schedule_node *gpu_tree_move_down_to_shared( + __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core); __isl_give isl_schedule_node *gpu_tree_move_up_to_thread( __isl_take isl_schedule_node *node); __isl_give isl_schedule_node *gpu_tree_move_down_to_thread( Index: polly/trunk/lib/External/ppcg/gpu_tree.c =================================================================== --- polly/trunk/lib/External/ppcg/gpu_tree.c +++ polly/trunk/lib/External/ppcg/gpu_tree.c @@ -63,6 +63,13 @@ return is_marked(node, "kernel"); } +/* Is "node" a mark node with an identifier called "shared"? + */ +static int node_is_shared(__isl_keep isl_schedule_node *node) +{ + return is_marked(node, "shared"); +} + /* Is "node" a mark node with an identifier called "thread"? */ static int node_is_thread(__isl_keep isl_schedule_node *node) @@ -70,6 +77,77 @@ return is_marked(node, "thread"); } +/* Insert a mark node with identifier "shared" in front of "node". + */ +static __isl_give isl_schedule_node *insert_shared( + __isl_take isl_schedule_node *node) +{ + isl_ctx *ctx; + isl_id *id; + + ctx = isl_schedule_node_get_ctx(node); + id = isl_id_alloc(ctx, "shared", NULL); + node = isl_schedule_node_insert_mark(node, id); + + return node; +} + +/* Insert a "shared" mark in front of the "thread" mark + * provided the linear branch between "node" and the "thread" mark + * does not contain such a "shared" mark already. + * + * As a side effect, this function checks that the subtree at "node" + * actually contains a "thread" mark and that there is no branching + * in between "node" and this "thread" mark. + */ +__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread( + __isl_take isl_schedule_node *node) +{ + int depth0, depth; + int any_shared = 0; + + if (!node) + return NULL; + + depth0 = isl_schedule_node_get_tree_depth(node); + + for (;;) { + int is_thread; + int n; + + if (!any_shared) { + any_shared = node_is_shared(node); + if (any_shared < 0) + return isl_schedule_node_free(node); + } + is_thread = node_is_thread(node); + if (is_thread < 0) + return isl_schedule_node_free(node); + if (is_thread) + break; + n = isl_schedule_node_n_children(node); + if (n == 0) + isl_die(isl_schedule_node_get_ctx(node), + isl_error_invalid, + "no thread marker found", + return isl_schedule_node_free(node)); + if (n > 1) + isl_die(isl_schedule_node_get_ctx(node), + isl_error_invalid, + "expecting single thread marker", + return isl_schedule_node_free(node)); + + node = isl_schedule_node_child(node, 0); + } + + if (!any_shared) + node = insert_shared(node); + depth = isl_schedule_node_get_tree_depth(node); + node = isl_schedule_node_ancestor(node, depth - depth0); + + return node; +} + /* Assuming "node" is a filter node, does it correspond to the branch * that contains the "thread" mark, i.e., does it contain any elements * in "core"? @@ -127,6 +205,23 @@ } /* Move down the branch between "kernel" and "thread" until + * the "shared" mark is reached, where the branch containing the "shared" + * mark is identified by the domain elements in "core". 
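
node_is_shared() and node_is_thread() above reduce to a name test on mark nodes; a minimal sketch of such a test (a hypothetical stand-alone variant of this file's is_marked(), assuming the mark identifiers are named):

#include <string.h>
#include <isl/id.h>
#include <isl/schedule_node.h>

/* Is "node" a mark node with an identifier called "name"?
 * Return -1 on error.
 */
static int node_is_marked_with(__isl_keep isl_schedule_node *node,
        const char *name)
{
        isl_id *mark;
        int has_name;

        if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
                return 0;
        mark = isl_schedule_node_mark_get_id(node);
        if (!mark)
                return -1;
        has_name = !strcmp(isl_id_get_name(mark), name);
        isl_id_free(mark);

        return has_name;
}
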
+ */ +__isl_give isl_schedule_node *gpu_tree_move_down_to_shared( + __isl_take isl_schedule_node *node, __isl_keep isl_union_set *core) +{ + int is_shared; + + while ((is_shared = node_is_shared(node)) == 0) + node = core_child(node, core); + if (is_shared < 0) + node = isl_schedule_node_free(node); + + return node; +} + +/* Move down the branch between "kernel" and "thread" until * the "thread" mark is reached, where the branch containing the "thread" * mark is identified by the domain elements in "core". */ @@ -189,7 +284,8 @@ __isl_take isl_schedule_node *node, int depth, __isl_keep isl_union_set *core) { - int is_thread; + int is_shared; + int is_thread = 0; while (node && isl_schedule_node_get_schedule_depth(node) < depth) { if (isl_schedule_node_get_type(node) == @@ -203,10 +299,11 @@ } node = core_child(node, core); } - while ((is_thread = node_is_thread(node)) == 0 && + while ((is_shared = node_is_shared(node)) == 0 && + (is_thread = node_is_thread(node)) == 0 && isl_schedule_node_get_type(node) != isl_schedule_node_band) node = core_child(node, core); - if (is_thread < 0) + if (is_shared < 0 || is_thread < 0) node = isl_schedule_node_free(node); return node; Index: polly/trunk/lib/External/ppcg/grouping.c =================================================================== --- polly/trunk/lib/External/ppcg/grouping.c +++ polly/trunk/lib/External/ppcg/grouping.c @@ -0,0 +1,684 @@ +/* + * Copyright 2016 Sven Verdoolaege + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ppcg.h" + +/* Internal data structure for use during the detection of statements + * that can be grouped. + * + * "sc" contains the original schedule constraints (not a copy). + * "dep" contains the intersection of the validity and the proximity + * constraints in "sc". It may be NULL if it has not been computed yet. + * "group_id" is the identifier for the next group that is extracted. + * + * "domain" is the set of statement instances that belong to any of the groups. + * "contraction" maps the elements of "domain" to the corresponding group + * instances. + * "schedule" schedules the statements in each group relatively to each other. + * These last three fields are NULL if no groups have been found so far. + */ +struct ppcg_grouping { + isl_schedule_constraints *sc; + + isl_union_map *dep; + int group_id; + + isl_union_set *domain; + isl_union_pw_multi_aff *contraction; + isl_schedule *schedule; +}; + +/* Clear all memory allocated by "grouping". + */ +static void ppcg_grouping_clear(struct ppcg_grouping *grouping) +{ + isl_union_map_free(grouping->dep); + isl_union_set_free(grouping->domain); + isl_union_pw_multi_aff_free(grouping->contraction); + isl_schedule_free(grouping->schedule); +} + +/* Compute the intersection of the proximity and validity dependences + * in grouping->sc and store the result in grouping->dep, unless + * this intersection has been computed before. 
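
With made-up schedule constraints, this intersection keeps exactly the dependences that are both validity and proximity constraints (a minimal sketch, not part of the patch):

#include <isl/union_map.h>

/* Of the two validity constraints, only S[i] -> T[i] is also
 * a proximity constraint, so only that relation is kept.
 */
static __isl_give isl_union_map *example_dep(isl_ctx *ctx)
{
        isl_union_map *validity, *proximity;

        validity = isl_union_map_read_from_str(ctx,
                "{ S[i] -> T[i]; S[i] -> S[i + 1] }");
        proximity = isl_union_map_read_from_str(ctx,
                "{ S[i] -> T[i] }");
        return isl_union_map_intersect(validity, proximity);
}
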
+ */ +static isl_stat ppcg_grouping_compute_dep(struct ppcg_grouping *grouping) +{ + isl_union_map *validity, *proximity; + + if (grouping->dep) + return isl_stat_ok; + + validity = isl_schedule_constraints_get_validity(grouping->sc); + proximity = isl_schedule_constraints_get_proximity(grouping->sc); + grouping->dep = isl_union_map_intersect(validity, proximity); + + if (!grouping->dep) + return isl_stat_error; + + return isl_stat_ok; +} + +/* Information extracted from one or more consecutive leaves + * in the input schedule. + * + * "list" contains the sets of statement instances in the leaves, + * one element in the list for each original leaf. + * "domain" contains the union of the sets in "list". + * "prefix" contains the prefix schedule of these elements. + */ +struct ppcg_grouping_leaf { + isl_union_set *domain; + isl_union_set_list *list; + isl_multi_union_pw_aff *prefix; +}; + +/* Free all memory allocated for "leaves". + */ +static void ppcg_grouping_leaf_free(int n, struct ppcg_grouping_leaf leaves[n]) +{ + int i; + + if (!leaves) + return; + + for (i = 0; i < n; ++i) { + isl_union_set_free(leaves[i].domain); + isl_union_set_list_free(leaves[i].list); + isl_multi_union_pw_aff_free(leaves[i].prefix); + } + + free(leaves); +} + +/* Short-hand for retrieving the prefix schedule at "node" + * in the form of an isl_multi_union_pw_aff. + */ +static __isl_give isl_multi_union_pw_aff *get_prefix( + __isl_keep isl_schedule_node *node) +{ + return isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); +} + +/* Return an array of "n" elements with information extracted from + * the "n" children of "node" starting at "first", all of which + * are known to be filtered leaves. + */ +struct ppcg_grouping_leaf *extract_leaves(__isl_keep isl_schedule_node *node, + int first, int n) +{ + int i; + isl_ctx *ctx; + struct ppcg_grouping_leaf *leaves; + + if (!node) + return NULL; + + ctx = isl_schedule_node_get_ctx(node); + leaves = isl_calloc_array(ctx, struct ppcg_grouping_leaf, n); + if (!leaves) + return NULL; + + for (i = 0; i < n; ++i) { + isl_schedule_node *child; + isl_union_set *domain; + + child = isl_schedule_node_get_child(node, first + i); + child = isl_schedule_node_child(child, 0); + domain = isl_schedule_node_get_domain(child); + leaves[i].domain = isl_union_set_copy(domain); + leaves[i].list = isl_union_set_list_from_union_set(domain); + leaves[i].prefix = get_prefix(child); + isl_schedule_node_free(child); + } + + return leaves; +} + +/* Internal data structure used by merge_leaves. + * + * "src" and "dst" point to the two consecutive leaves that are + * under investigation for being merged. + * "merge" is initially set to 0 and is set to 1 as soon as + * it turns out that it is useful to merge the two leaves. + */ +struct ppcg_merge_leaves_data { + int merge; + struct ppcg_grouping_leaf *src; + struct ppcg_grouping_leaf *dst; +}; + +/* Given a relation "map" between instances of two statements A and B, + * does it relate every instance of A (according to the domain of "src") + * to every instance of B (according to the domain of "dst")? 
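+ *
+ * For instance, with illustrative instance sets A[0..9] in "src" and
+ * B[0..9] in "dst", the relation { A[i] -> B[i] } has this property,
+ * while { A[i] -> B[i] : i < 5 } does not, since its domain and range
+ * leave out some instances of A and B.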
+ */ +static isl_bool covers_src_and_dst(__isl_keep isl_map *map, + struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst) +{ + isl_space *space; + isl_set *set1, *set2; + isl_bool is_subset; + + space = isl_space_domain(isl_map_get_space(map)); + set1 = isl_union_set_extract_set(src->domain, space); + set2 = isl_map_domain(isl_map_copy(map)); + is_subset = isl_set_is_subset(set1, set2); + isl_set_free(set1); + isl_set_free(set2); + if (is_subset < 0 || !is_subset) + return is_subset; + + space = isl_space_range(isl_map_get_space(map)); + set1 = isl_union_set_extract_set(dst->domain, space); + set2 = isl_map_range(isl_map_copy(map)); + is_subset = isl_set_is_subset(set1, set2); + isl_set_free(set1); + isl_set_free(set2); + + return is_subset; +} + +/* Given a relation "map" between instances of two statements A and B, + * are pairs of related instances executed together in the input schedule? + * That is, is each pair of instances assigned the same value + * by the corresponding prefix schedules? + * + * In particular, select the subset of "map" that has pairs of elements + * with the same value for the prefix schedules and then check + * if "map" is still a subset of the result. + */ +static isl_bool matches_prefix(__isl_keep isl_map *map, + struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst) +{ + isl_union_map *umap, *equal; + isl_multi_union_pw_aff *src_prefix, *dst_prefix, *prefix; + isl_bool is_subset; + + src_prefix = isl_multi_union_pw_aff_copy(src->prefix); + dst_prefix = isl_multi_union_pw_aff_copy(dst->prefix); + prefix = isl_multi_union_pw_aff_union_add(src_prefix, dst_prefix); + + umap = isl_union_map_from_map(isl_map_copy(map)); + equal = isl_union_map_copy(umap); + equal = isl_union_map_eq_at_multi_union_pw_aff(equal, prefix); + + is_subset = isl_union_map_is_subset(umap, equal); + + isl_union_map_free(umap); + isl_union_map_free(equal); + + return is_subset; +} + +/* Given a set of validity and proximity schedule constraints "map" + * between statements in consecutive leaves in a valid schedule, + * should the two leaves be merged into one? + * + * In particular, the two are merged if the constraints form + * a bijection between every instance of the first statement and + * every instance of the second statement. Moreover, each + * pair of such dependent instances needs to be executed consecutively + * in the input schedule. That is, they need to be assigned + * the same value by their prefix schedules. + * + * What this means is that for each instance of the first statement + * there is exactly one instance of the second statement that + * is executed immediately after the instance of the first statement and + * that, moreover, both depends on this statement instance and + * should be brought as close as possible to this statement instance. + * In other words, it is both possible to execute the two instances + * together (according to the input schedule) and desirable to do so + * (according to the validity and proximity schedule constraints). 
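+ *
+ * As a hypothetical example, consider the statements
+ *
+ *	S1:	t[i] = f(a[i]);
+ *	S2:	b[i] = g(t[i]);
+ *
+ * executed by a common loop over i. The dependence { S1[i] -> S2[i] }
+ * covers both instance sets, is bijective and only relates instances
+ * that are assigned the same value by the prefix schedules, so the
+ * corresponding leaves would be merged.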
+ */
+static isl_stat check_merge(__isl_take isl_map *map, void *user)
+{
+	struct ppcg_merge_leaves_data *data = user;
+	isl_bool ok;
+
+	ok = covers_src_and_dst(map, data->src, data->dst);
+	if (ok >= 0 && ok)
+		ok = isl_map_is_bijective(map);
+	if (ok >= 0 && ok)
+		ok = matches_prefix(map, data->src, data->dst);
+
+	isl_map_free(map);
+
+	if (ok < 0)
+		return isl_stat_error;
+	if (!ok)
+		return isl_stat_ok;
+
+	data->merge = 1;
+	return isl_stat_error;
+}
+
+/* Merge the leaves at position "pos" and "pos + 1" in "leaves".
+ */
+static isl_stat merge_pair(int n, struct ppcg_grouping_leaf leaves[n], int pos)
+{
+	int i;
+
+	leaves[pos].domain = isl_union_set_union(leaves[pos].domain,
+					leaves[pos + 1].domain);
+	leaves[pos].list = isl_union_set_list_concat(leaves[pos].list,
+					leaves[pos + 1].list);
+	leaves[pos].prefix = isl_multi_union_pw_aff_union_add(
+					leaves[pos].prefix, leaves[pos + 1].prefix);
+	for (i = pos + 1; i + 1 < n; ++i)
+		leaves[i] = leaves[i + 1];
+	leaves[n - 1].domain = NULL;
+	leaves[n - 1].list = NULL;
+	leaves[n - 1].prefix = NULL;
+
+	if (!leaves[pos].domain || !leaves[pos].list || !leaves[pos].prefix)
+		return isl_stat_error;
+
+	return isl_stat_ok;
+}
+
+/* Merge pairs of consecutive leaves in "leaves" taking into account
+ * the intersection of validity and proximity schedule constraints "dep".
+ *
+ * If a leaf has been merged with the next leaf, then the combination
+ * is checked again for merging with the next leaf.
+ * That is, if the leaves are A, B and C, then B may not have been
+ * merged with C, but after merging A and B, it could still be useful
+ * to merge the combination AB with C.
+ *
+ * Two leaves A and B are merged if there are instances of at least
+ * one pair of statements, one statement in A and one in B, such that
+ * the validity and proximity schedule constraints between them
+ * make them suitable for merging according to check_merge.
+ *
+ * Return the final number of leaves in the sequence, or -1 on error.
+ */
+static int merge_leaves(int n, struct ppcg_grouping_leaf leaves[n],
+	__isl_keep isl_union_map *dep)
+{
+	int i;
+	struct ppcg_merge_leaves_data data;
+
+	for (i = n - 1; i >= 0; --i) {
+		isl_union_map *dep_i;
+		isl_stat ok;
+
+		if (i + 1 >= n)
+			continue;
+
+		dep_i = isl_union_map_copy(dep);
+		dep_i = isl_union_map_intersect_domain(dep_i,
+			isl_union_set_copy(leaves[i].domain));
+		dep_i = isl_union_map_intersect_range(dep_i,
+			isl_union_set_copy(leaves[i + 1].domain));
+		data.merge = 0;
+		data.src = &leaves[i];
+		data.dst = &leaves[i + 1];
+		ok = isl_union_map_foreach_map(dep_i, &check_merge, &data);
+		isl_union_map_free(dep_i);
+		if (ok < 0 && !data.merge)
+			return -1;
+		if (!data.merge)
+			continue;
+		if (merge_pair(n, leaves, i) < 0)
+			return -1;
+		--n;
+		++i;
+	}
+
+	return n;
+}
+
+/* Construct a schedule with "domain" as domain, that executes
+ * the elements of "list" in order (as a sequence).
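+ *
+ * For example, for a "list" with the instance sets of two statements
+ * S1 and S2, the result is a schedule tree of the form
+ *
+ *	domain (instances of S1 and S2)
+ *	  sequence
+ *	    filter (instances of S1)
+ *	    filter (instances of S2)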
+ */ +static __isl_give isl_schedule *schedule_from_domain_and_list( + __isl_keep isl_union_set *domain, __isl_keep isl_union_set_list *list) +{ + isl_schedule *schedule; + isl_schedule_node *node; + + schedule = isl_schedule_from_domain(isl_union_set_copy(domain)); + node = isl_schedule_get_root(schedule); + isl_schedule_free(schedule); + node = isl_schedule_node_child(node, 0); + list = isl_union_set_list_copy(list); + node = isl_schedule_node_insert_sequence(node, list); + schedule = isl_schedule_node_get_schedule(node); + isl_schedule_node_free(node); + + return schedule; +} + +/* Construct a unique identifier for a group in "grouping". + * + * The name is of the form G_n, with n the first value starting at + * grouping->group_id that does not result in an identifier + * that is already in use in the domain of the original schedule + * constraints. + */ +static isl_id *construct_group_id(struct ppcg_grouping *grouping, + __isl_take isl_space *space) +{ + isl_ctx *ctx; + isl_id *id; + isl_bool empty; + isl_union_set *domain; + + if (!space) + return NULL; + + ctx = isl_space_get_ctx(space); + domain = isl_schedule_constraints_get_domain(grouping->sc); + + do { + char buffer[20]; + isl_id *id; + isl_set *set; + + snprintf(buffer, sizeof(buffer), "G_%d", grouping->group_id); + grouping->group_id++; + id = isl_id_alloc(ctx, buffer, NULL); + space = isl_space_set_tuple_id(space, isl_dim_set, id); + set = isl_union_set_extract_set(domain, isl_space_copy(space)); + empty = isl_set_plain_is_empty(set); + isl_set_free(set); + } while (empty >= 0 && !empty); + + if (empty < 0) + space = isl_space_free(space); + + id = isl_space_get_tuple_id(space, isl_dim_set); + + isl_space_free(space); + isl_union_set_free(domain); + + return id; +} + +/* Construct a contraction from "prefix" and "domain" for a new group + * in "grouping". + * + * The values of the prefix schedule "prefix" are used as instances + * of the new group. The identifier of the group is constructed + * in such a way that it does not conflict with those of earlier + * groups nor with statements in the domain of the original + * schedule constraints. + * The isl_multi_union_pw_aff "prefix" then simply needs to be + * converted to an isl_union_pw_multi_aff. However, this is not + * possible if "prefix" is zero-dimensional, so in this case, + * a contraction is constructed from "domain" instead. + */ +static isl_union_pw_multi_aff *group_contraction_from_prefix_and_domain( + struct ppcg_grouping *grouping, + __isl_keep isl_multi_union_pw_aff *prefix, + __isl_keep isl_union_set *domain) +{ + isl_id *id; + isl_space *space; + int dim; + + space = isl_multi_union_pw_aff_get_space(prefix); + if (!space) + return NULL; + dim = isl_space_dim(space, isl_dim_set); + id = construct_group_id(grouping, space); + if (dim == 0) { + isl_multi_val *mv; + + space = isl_multi_union_pw_aff_get_space(prefix); + space = isl_space_set_tuple_id(space, isl_dim_set, id); + mv = isl_multi_val_zero(space); + domain = isl_union_set_copy(domain); + return isl_union_pw_multi_aff_multi_val_on_domain(domain, mv); + } + prefix = isl_multi_union_pw_aff_copy(prefix); + prefix = isl_multi_union_pw_aff_set_tuple_id(prefix, isl_dim_out, id); + return isl_union_pw_multi_aff_from_multi_union_pw_aff(prefix); +} + +/* Extend "grouping" with groups corresponding to merged + * leaves in the list of potentially merged leaves "leaves". 
+ *
+ * The "list" field of each element in "leaves" contains a list
+ * of the instance sets of the original leaves that have been
+ * merged into this element. If at least two of the original leaves
+ * have been merged into a given element, then add the corresponding
+ * group to "grouping".
+ * In particular, the domain is extended with the statement instances
+ * of the merged leaves, the contraction is extended with a mapping
+ * of these statement instances to instances of a new group and
+ * the schedule is extended with a schedule that executes
+ * the statement instances according to the order of the leaves
+ * in which they appear.
+ * Since the instances of the groups should already be scheduled apart
+ * in the schedule into which this schedule will be plugged,
+ * the schedules of the individual groups are combined independently
+ * of each other (as a set).
+ */
+static isl_stat add_groups(struct ppcg_grouping *grouping,
+	int n, struct ppcg_grouping_leaf leaves[n])
+{
+	int i;
+
+	for (i = 0; i < n; ++i) {
+		int n_leaf;
+		isl_schedule *schedule;
+		isl_union_set *domain;
+		isl_union_pw_multi_aff *upma;
+
+		n_leaf = isl_union_set_list_n_union_set(leaves[i].list);
+		if (n_leaf < 0)
+			return isl_stat_error;
+		if (n_leaf <= 1)
+			continue;
+		schedule = schedule_from_domain_and_list(leaves[i].domain,
+					leaves[i].list);
+		upma = group_contraction_from_prefix_and_domain(grouping,
+					leaves[i].prefix, leaves[i].domain);
+
+		domain = isl_union_set_copy(leaves[i].domain);
+		if (grouping->domain) {
+			domain = isl_union_set_union(domain, grouping->domain);
+			upma = isl_union_pw_multi_aff_union_add(upma,
+						grouping->contraction);
+			schedule = isl_schedule_set(schedule,
+						grouping->schedule);
+		}
+		grouping->domain = domain;
+		grouping->contraction = upma;
+		grouping->schedule = schedule;
+
+		if (!grouping->domain || !grouping->contraction ||
+		    !grouping->schedule)
+			return isl_stat_error;
+	}
+
+	return isl_stat_ok;
+}
+
+/* Look for any pairs of consecutive leaves among the "n" children of "node"
+ * starting at "first" that should be merged together.
+ * Store the results in "grouping".
+ *
+ * First make sure the intersection of validity and proximity
+ * schedule constraints is available and extract the required
+ * information from the "n" leaves.
+ * Then try and merge consecutive leaves based on the validity
+ * and proximity constraints.
+ * If any pairs were successfully merged, then add groups
+ * corresponding to the merged leaves to "grouping".
+ */
+static isl_stat group_subsequence(__isl_keep isl_schedule_node *node,
+	int first, int n, struct ppcg_grouping *grouping)
+{
+	int n_merge;
+	struct ppcg_grouping_leaf *leaves;
+
+	if (ppcg_grouping_compute_dep(grouping) < 0)
+		return isl_stat_error;
+
+	leaves = extract_leaves(node, first, n);
+	if (!leaves)
+		return isl_stat_error;
+
+	n_merge = merge_leaves(n, leaves, grouping->dep);
+	if (n_merge < 0 ||
+	    (n_merge < n && add_groups(grouping, n_merge, leaves) < 0)) {
+		ppcg_grouping_leaf_free(n, leaves);
+		return isl_stat_error;
+	}
+
+	ppcg_grouping_leaf_free(n, leaves);
+
+	return isl_stat_ok;
+}
+
+/* If "node" is a sequence, then check if it has any consecutive
+ * leaves that should be merged together and store the results
+ * in "grouping".
+ *
+ * In particular, call group_subsequence on each consecutive
+ * sequence of (filtered) leaves among the children of "node".
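+ *
+ * For instance, if the filtered children of the sequence have subtrees
+ * of types
+ *
+ *	leaf, leaf, band, leaf, leaf
+ *
+ * then group_subsequence is called once on the first two children and
+ * once on the last two.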
+ */
+static isl_bool detect_groups(__isl_keep isl_schedule_node *node, void *user)
+{
+	int i, n, first;
+	struct ppcg_grouping *grouping = user;
+
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
+		return isl_bool_true;
+
+	n = isl_schedule_node_n_children(node);
+	if (n < 0)
+		return isl_bool_error;
+
+	first = -1;
+	for (i = 0; i < n; ++i) {
+		isl_schedule_node *child;
+		enum isl_schedule_node_type type;
+
+		child = isl_schedule_node_get_child(node, i);
+		child = isl_schedule_node_child(child, 0);
+		type = isl_schedule_node_get_type(child);
+		isl_schedule_node_free(child);
+
+		if (first >= 0 && type != isl_schedule_node_leaf) {
+			if (group_subsequence(node, first, i - first,
+						grouping) < 0)
+				return isl_bool_error;
+			first = -1;
+		}
+		if (first < 0 && type == isl_schedule_node_leaf)
+			first = i;
+	}
+	if (first >= 0) {
+		if (group_subsequence(node, first, n - first, grouping) < 0)
+			return isl_bool_error;
+	}
+
+	return isl_bool_true;
+}
+
+/* Complete "grouping" to cover all statement instances in the domain
+ * of grouping->sc.
+ *
+ * In particular, grouping->domain is set to the full set of statement
+ * instances; grouping->contraction is extended with an identity
+ * contraction on the additional instances and grouping->schedule
+ * is extended with an independent schedule on those additional instances.
+ * In the extension of grouping->contraction, the additional instances
+ * are split into those that belong to different statements and those
+ * that belong to some of the same statements. The first group
+ * is replaced by its universe in order to simplify the contraction extension.
+ */
+static void complete_grouping(struct ppcg_grouping *grouping)
+{
+	isl_union_set *domain, *left, *overlap;
+	isl_union_pw_multi_aff *upma;
+	isl_schedule *schedule;
+
+	domain = isl_schedule_constraints_get_domain(grouping->sc);
+	left = isl_union_set_subtract(isl_union_set_copy(domain),
+				isl_union_set_copy(grouping->domain));
+	schedule = isl_schedule_from_domain(isl_union_set_copy(left));
+	schedule = isl_schedule_set(schedule, grouping->schedule);
+	grouping->schedule = schedule;
+
+	overlap = isl_union_set_universe(grouping->domain);
+	grouping->domain = domain;
+	overlap = isl_union_set_intersect(isl_union_set_copy(left), overlap);
+	left = isl_union_set_subtract(left, isl_union_set_copy(overlap));
+	left = isl_union_set_universe(left);
+	left = isl_union_set_union(left, overlap);
+	upma = isl_union_set_identity_union_pw_multi_aff(left);
+	upma = isl_union_pw_multi_aff_union_add(upma, grouping->contraction);
+	grouping->contraction = upma;
+}
+
+/* Compute a schedule on the domain of "sc" that respects the schedule
+ * constraints in "sc".
+ *
+ * "schedule" is a known correct schedule that is used to combine
+ * groups of statements if options->group_chains is set.
+ * In particular, statements that are executed consecutively in a sequence
+ * in this schedule and where all instances of the second depend on
+ * the instance of the first that is executed in the same iteration
+ * of outer band nodes are grouped together into a single statement.
+ * The schedule constraints are then mapped to these groups of statements
+ * and the resulting schedule is expanded again to refer to the original
+ * statements.
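+ *
+ * Schematically, the grouped part of the flow below amounts to
+ * (with "group_schedule" standing for the schedule collected in
+ * "grouping"; the names are illustrative)
+ *
+ *	umap = isl_union_map_from_union_pw_multi_aff(contraction);
+ *	sc = isl_schedule_constraints_apply(sc, umap);
+ *	res = isl_schedule_constraints_compute_schedule(sc);
+ *	res = isl_schedule_expand(res, contraction, group_schedule);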
+ */ +__isl_give isl_schedule *ppcg_compute_schedule( + __isl_take isl_schedule_constraints *sc, + __isl_keep isl_schedule *schedule, struct ppcg_options *options) +{ + struct ppcg_grouping grouping = { sc }; + isl_union_pw_multi_aff *contraction; + isl_union_map *umap; + isl_schedule *res, *expansion; + + if (!options->group_chains) + return isl_schedule_constraints_compute_schedule(sc); + + grouping.group_id = 0; + if (isl_schedule_foreach_schedule_node_top_down(schedule, + &detect_groups, &grouping) < 0) + goto error; + if (!grouping.contraction) { + ppcg_grouping_clear(&grouping); + return isl_schedule_constraints_compute_schedule(sc); + } + complete_grouping(&grouping); + contraction = isl_union_pw_multi_aff_copy(grouping.contraction); + umap = isl_union_map_from_union_pw_multi_aff(contraction); + + sc = isl_schedule_constraints_apply(sc, umap); + + res = isl_schedule_constraints_compute_schedule(sc); + + contraction = isl_union_pw_multi_aff_copy(grouping.contraction); + expansion = isl_schedule_copy(grouping.schedule); + res = isl_schedule_expand(res, contraction, expansion); + + ppcg_grouping_clear(&grouping); + return res; +error: + ppcg_grouping_clear(&grouping); + isl_schedule_constraints_free(sc); + return NULL; +} Index: polly/trunk/lib/External/ppcg/hybrid.h =================================================================== --- polly/trunk/lib/External/ppcg/hybrid.h +++ polly/trunk/lib/External/ppcg/hybrid.h @@ -0,0 +1,41 @@ +#ifndef HYBRID_H +#define HYBRID_H + +#include +#include + +#include "ppcg.h" + +struct ppcg_ht_bounds; +typedef struct ppcg_ht_bounds ppcg_ht_bounds; + +struct ppcg_ht_phase; +typedef struct ppcg_ht_phase ppcg_ht_phase; + +isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node); +isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node); + +__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop, + __isl_keep isl_schedule_node *node); +void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds); +isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds); +isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds, + __isl_keep isl_multi_val *sizes); +__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling( + __isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes, + __isl_take isl_schedule_node *node, struct ppcg_options *options); +__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free( + __isl_take ppcg_ht_bounds *bounds); + +__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark( + __isl_keep isl_schedule_node *node); +__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point( + __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node); +__isl_give isl_schedule_node *hybrid_tile_foreach_phase( + __isl_take isl_schedule_node *node, + __isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node, + void *user), void *user); +__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks( + __isl_take isl_schedule_node *node); + +#endif Index: polly/trunk/lib/External/ppcg/hybrid.c =================================================================== --- polly/trunk/lib/External/ppcg/hybrid.c +++ polly/trunk/lib/External/ppcg/hybrid.c @@ -0,0 +1,2242 @@ +/* + * Copyright 2013 Ecole Normale Superieure + * Copyright 2015 Sven Verdoolaege + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France + */ + +#include + +#include +#include +#include +#include 
+#include +#include +#include +#include + +#include "hybrid.h" +#include "schedule.h" + +/* The hybrid tiling implemented in this file is based on + * Grosser et al., "Hybrid Hexagonal/Classical Tiling for GPUs". + */ + +/* Bounds on relative dependence distances in input to hybrid tiling. + * upper is an upper bound on the relative dependence distances + * in the first space dimension + * -lower is a lower bound on the relative dependence distances + * in all space dimensions. + * + * In particular, + * + * d_i >= -lower_i d_0 + * and + * d_1 <= upper d_0 + * + * for each dependence distance vector d, where d_1 is the component + * corresponding to the first space dimension. + * + * upper and lower are always non-negative. + * Some of the values may be NaN if no bound could be found. + */ +struct ppcg_ht_bounds { + isl_val *upper; + isl_multi_val *lower; +}; + +/* Free "bounds" along with all its fields. + */ +__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free( + __isl_take ppcg_ht_bounds *bounds) +{ + if (!bounds) + return NULL; + isl_val_free(bounds->upper); + isl_multi_val_free(bounds->lower); + free(bounds); + + return NULL; +} + +/* Create a ppcg_ht_bounds object for a band living in "space". + * The bounds are initialized to NaN. + */ +__isl_give ppcg_ht_bounds *ppcg_ht_bounds_alloc(__isl_take isl_space *space) +{ + int i, n; + isl_ctx *ctx; + ppcg_ht_bounds *bounds; + + if (!space) + return NULL; + + ctx = isl_space_get_ctx(space); + bounds = isl_alloc_type(ctx, struct ppcg_ht_bounds); + if (!bounds) + goto error; + bounds->upper = isl_val_nan(ctx); + bounds->lower = isl_multi_val_zero(space); + n = isl_multi_val_dim(bounds->lower, isl_dim_set); + for (i = 0; i < n; ++i) { + isl_val *v = isl_val_copy(bounds->upper); + bounds->lower = isl_multi_val_set_val(bounds->lower, i, v); + } + + if (!bounds->lower || !bounds->upper) + return ppcg_ht_bounds_free(bounds); + + return bounds; +error: + isl_space_free(space); + return NULL; +} + +void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds) +{ + if (!bounds) + return; + + fprintf(stderr, "lower: "); + isl_multi_val_dump(bounds->lower); + fprintf(stderr, "upper: "); + isl_val_dump(bounds->upper); +} + +/* Return the upper bound on the relative dependence distances + * in the first space dimension. + */ +__isl_give isl_val *ppcg_ht_bounds_get_upper(__isl_keep ppcg_ht_bounds *bounds) +{ + if (!bounds) + return NULL; + return isl_val_copy(bounds->upper); +} + +/* Replace the upper bound on the relative dependence distances + * in the first space dimension by "upper". + */ +__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_upper( + __isl_take ppcg_ht_bounds *bounds, __isl_take isl_val *upper) +{ + if (!bounds || !upper) + goto error; + isl_val_free(bounds->upper); + bounds->upper = upper; + return bounds; +error: + ppcg_ht_bounds_free(bounds); + isl_val_free(upper); + return NULL; +} + +/* Return the lower bound on the relative dependence distances + * in space dimension "pos". + */ +__isl_give isl_val *ppcg_ht_bounds_get_lower(__isl_keep ppcg_ht_bounds *bounds, + int pos) +{ + if (!bounds) + return NULL; + return isl_multi_val_get_val(bounds->lower, pos); +} + +/* Replace the lower bound on the relative dependence distances + * in space dimension "pos" by "lower". 
+ */
+__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_lower(
+	__isl_take ppcg_ht_bounds *bounds, int pos, __isl_take isl_val *lower)
+{
+	if (!bounds || !lower)
+		goto error;
+	bounds->lower = isl_multi_val_set_val(bounds->lower, pos, lower);
+	if (!bounds->lower)
+		return ppcg_ht_bounds_free(bounds);
+	return bounds;
+error:
+	ppcg_ht_bounds_free(bounds);
+	isl_val_free(lower);
+	return NULL;
+}
+
+/* Can the bounds on relative dependence distances recorded in "bounds"
+ * be used to perform hybrid tiling?
+ * In particular, have appropriate lower and upper bounds been found?
+ * Any NaN indicates that no corresponding bound was found.
+ */
+isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds)
+{
+	isl_bool is_nan;
+	int i, n;
+
+	if (!bounds)
+		return isl_bool_error;
+	is_nan = isl_val_is_nan(bounds->upper);
+	if (is_nan < 0)
+		return isl_bool_error;
+	if (is_nan)
+		return isl_bool_false;
+
+	n = isl_multi_val_dim(bounds->lower, isl_dim_set);
+	for (i = 0; i < n; ++i) {
+		isl_val *v;
+
+		v = isl_multi_val_get_val(bounds->lower, i);
+		is_nan = isl_val_is_nan(v);
+		isl_val_free(v);
+		if (is_nan < 0)
+			return isl_bool_error;
+		if (is_nan)
+			return isl_bool_false;
+	}
+
+	return isl_bool_true;
+}
+
+/* Structure that represents the basic hexagonal tiling,
+ * along with information that is needed to perform the hybrid tiling.
+ *
+ * "bounds" are the bounds on the dependence distances that
+ * define the hexagonal shape and the required skewing in the remaining
+ * space dimensions.
+ *
+ * "input_node" points to the input pair of band nodes.
+ * "input_schedule" is the partial schedule of this input pair of band nodes.
+ * The space of this schedule is [P -> C], where P is the space
+ * of the parent node and C is the space of the child node.
+ *
+ * "space_sizes" represent the total size of a tile for the space
+ * dimensions, i.e., those corresponding to the child node.
+ * The space of "space_sizes" is C.
+ * If S_0 is the original tile size in the first space dimension,
+ * then the first entry of "space_sizes" is equal to
+ * W = 2*S_0 + floor(d_l h) + floor(d_u h).
+ * The remaining entries are the same as in the original tile sizes.
+ *
+ * The basic hexagonal tiling "hex" is defined
+ * in a "ts" (time-space) space and corresponds to the phase-1 tiles.
+ * "time_tile" maps the "ts" space to the outer time tile.
+ * It is equal to ts[t, s] -> floor(t/(2 * S_t)), with S_t the original tile
+ * size corresponding to the parent node.
+ * "local_time" maps the "ts" space to the time dimension inside each tile.
+ * It is equal to ts[t, s] -> t mod (2 S_t), with S_t the original tile
+ * size corresponding to the parent node.
+ * "shift_space" shifts the tiles at time tile T = floor(t/(2 S_t))
+ * in the space dimension such that they align to a multiple of W.
+ * It is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W,
+ * with shift_s = S_0 + floor(d_u h).
+ * "shift_phase" is the shift taken to go from phase 0 to phase 1.
+ * It is equal to ts[t, s] -> ts[t + S_t, s + shift_s],
+ * with shift_s = S_0 + floor(d_u h).
+ *
+ * "project_ts" projects the space of the input schedule to the ts-space.
+ * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0].
+ */
+struct ppcg_ht_tiling {
+	int ref;
+
+	ppcg_ht_bounds *bounds;
+	isl_schedule_node *input_node;
+	isl_multi_union_pw_aff *input_schedule;
+
+	isl_multi_val *space_sizes;
+
+	isl_aff *time_tile;
+	isl_aff *local_time;
+	isl_aff *shift_space;
+	isl_multi_aff *shift_phase;
+	isl_set *hex;
+
+	isl_multi_aff *project_ts;
+};
+typedef struct ppcg_ht_tiling ppcg_ht_tiling;
+
+/* Return the space of the pair of band nodes that form the input
+ * to the hybrid tiling.
+ * In particular, return the space [P -> C], where P is the space
+ * of the parent node and C is the space of the child node.
+ */
+__isl_give isl_space *ppcg_ht_tiling_get_input_space(
+	__isl_keep ppcg_ht_tiling *tile)
+{
+	if (!tile)
+		return NULL;
+
+	return isl_multi_union_pw_aff_get_space(tile->input_schedule);
+}
+
+/* Remove a reference to "tiling" and free "tiling" along with all its fields
+ * as soon as the reference count drops to zero.
+ */
+static __isl_null ppcg_ht_tiling *ppcg_ht_tiling_free(
+	__isl_take ppcg_ht_tiling *tiling)
+{
+	if (!tiling)
+		return NULL;
+	if (--tiling->ref > 0)
+		return NULL;
+
+	ppcg_ht_bounds_free(tiling->bounds);
+	isl_schedule_node_free(tiling->input_node);
+	isl_multi_union_pw_aff_free(tiling->input_schedule);
+	isl_multi_val_free(tiling->space_sizes);
+	isl_aff_free(tiling->time_tile);
+	isl_aff_free(tiling->local_time);
+	isl_aff_free(tiling->shift_space);
+	isl_multi_aff_free(tiling->shift_phase);
+	isl_set_free(tiling->hex);
+	isl_multi_aff_free(tiling->project_ts);
+	free(tiling);
+
+	return NULL;
+}
+
+/* Return a new reference to "tiling".
+ */
+__isl_give ppcg_ht_tiling *ppcg_ht_tiling_copy(
+	__isl_keep ppcg_ht_tiling *tiling)
+{
+	if (!tiling)
+		return NULL;
+
+	tiling->ref++;
+	return tiling;
+}
+
+/* Return the isl_ctx to which "tiling" belongs.
+ */
+isl_ctx *ppcg_ht_tiling_get_ctx(__isl_keep ppcg_ht_tiling *tiling)
+{
+	if (!tiling)
+		return NULL;
+
+	return isl_multi_union_pw_aff_get_ctx(tiling->input_schedule);
+}
+
+/* Representation of one of the two phases of hybrid tiling.
+ *
+ * "tiling" points to the shared tiling data.
+ *
+ * "time_tile", "local_time" and "shift_space" are equal to the corresponding
+ * fields of "tiling", pulled back to the input space.
+ * In case of phase 0, these expressions have also been moved
+ * from phase 1 to phase 0.
+ *
+ * "domain" contains the hexagonal tiling of this phase.
+ *
+ * "space_shift" is the shift that should be added to the space band
+ * in order to be able to apply rectangular tiling to the space.
+ * For phase 1, it is equal to
+ *
+ *	[P[t] -> C[s_0, s_i]] -> C[(-(2 * shift_s)*T) % W, dl_i * u]
+ *
+ * with shift_s = S_0 + floor(d_u h),
+ * T equal to "time_tile" and u equal to "local_time".
+ * For phase 0, it is equal to
+ *
+ *	[P[t] -> C[s_0, s_i]] -> C[shift_s + (-(2 * shift_s)*T) % W, dl_i * u]
+ *
+ * "space_tile" is the space tiling. It is equal to
+ *
+ *	[P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)]
+ */
+struct ppcg_ht_phase {
+	ppcg_ht_tiling *tiling;
+
+	isl_aff *time_tile;
+	isl_aff *local_time;
+	isl_aff *shift_space;
+	isl_set *domain;
+
+	isl_multi_aff *space_shift;
+	isl_multi_aff *space_tile;
+};
+
+/* Free "phase" along with all its fields.
+ */ +static __isl_null ppcg_ht_phase *ppcg_ht_phase_free( + __isl_take ppcg_ht_phase *phase) +{ + if (!phase) + return NULL; + + ppcg_ht_tiling_free(phase->tiling); + isl_aff_free(phase->time_tile); + isl_aff_free(phase->local_time); + isl_aff_free(phase->shift_space); + isl_set_free(phase->domain); + isl_multi_aff_free(phase->space_shift); + isl_multi_aff_free(phase->space_tile); + free(phase); + + return NULL; +} + +/* Wrapper around ppcg_ht_phase_free for use as an argument + * to isl_id_set_free_user. + */ +static void ppcg_ht_phase_free_wrap(void *user) +{ + ppcg_ht_phase *phase = user; + + ppcg_ht_phase_free(phase); +} + +/* Return the domain of hybrid tiling phase "phase". + */ +static __isl_give isl_set *ppcg_ht_phase_get_domain(ppcg_ht_phase *phase) +{ + if (!phase) + return NULL; + + return isl_set_copy(phase->domain); +} + +/* Return the space of the pair of band nodes that form the input + * to the hybrid tiling of which "phase" is a phase. + * In particular, return the space [P -> C], where P is the space + * of the parent node and C is the space of the child node. + */ +static __isl_give isl_space *ppcg_ht_phase_get_input_space( + __isl_keep ppcg_ht_phase *phase) +{ + if (!phase) + return NULL; + + return ppcg_ht_tiling_get_input_space(phase->tiling); +} + +/* Construct the lower left constraint of the hexagonal tile, i.e., + * + * du a - b <= (2h+1) du - duh + * -du a + b + (2h+1) du - duh >= 0 + * + * where duh = floor(du * h). + * + * This constraint corresponds to (6) in + * "Hybrid Hexagonal/Classical Tiling for GPUs". + */ +static __isl_give isl_constraint *hex_lower_left(__isl_take isl_local_space *ls, + __isl_keep isl_val *h, __isl_keep isl_val *du, __isl_keep isl_val *duh) +{ + isl_val *v; + isl_aff *aff; + + v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1); + v = isl_val_mul(v, isl_val_copy(du)); + v = isl_val_sub(v, isl_val_copy(duh)); + aff = isl_aff_val_on_domain(ls, v); + v = isl_val_neg(isl_val_copy(du)); + aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, v); + aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, 1); + + return isl_inequality_from_aff(aff); +} + +/* Construct the lower constraint of the hexagonal tile, i.e., + * + * a <= 2h+1 + * -a + 2h+1 >= 0 + * + * This constraint corresponds to (7) in + * "Hybrid Hexagonal/Classical Tiling for GPUs". + */ +static __isl_give isl_constraint *hex_lower(__isl_take isl_local_space *ls, + __isl_keep isl_val *h) +{ + isl_val *v; + isl_aff *aff; + + v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1); + aff = isl_aff_val_on_domain(ls, v); + aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 0, -1); + + return isl_inequality_from_aff(aff); +} + +/* Construct the lower right constraint of the hexagonal tile, i.e., + * + * dl a + b <= (2h+1) dl + duh + (s0-1) + * -dl a - b + (2h+1) dl + duh + (s0-1) >= 0 + * + * where duh = floor(du * h). + * + * This constraint corresponds to (8) in + * "Hybrid Hexagonal/Classical Tiling for GPUs". 
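+ *
+ * For example, with the illustrative values h = 2, s0 = 4, dl = 1 and
+ * duh = 2, this constraint reduces to a + b <= 10.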
+ */
+static __isl_give isl_constraint *hex_lower_right(
+	__isl_take isl_local_space *ls, __isl_keep isl_val *h,
+	__isl_keep isl_val *s0, __isl_keep isl_val *dl, __isl_keep isl_val *duh)
+{
+	isl_val *v;
+	isl_aff *aff;
+
+	v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1);
+	v = isl_val_mul(v, isl_val_copy(dl));
+	v = isl_val_add(v, isl_val_copy(duh));
+	v = isl_val_add(v, isl_val_copy(s0));
+	v = isl_val_sub_ui(v, 1);
+	aff = isl_aff_val_on_domain(ls, v);
+	v = isl_val_neg(isl_val_copy(dl));
+	aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, v);
+	aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, -1);
+
+	return isl_inequality_from_aff(aff);
+}
+
+/* Construct the upper left constraint of the hexagonal tile, i.e.,
+ *
+ *	dl a + b >= h dl - (d - 1)/d	with d = den(dl)
+ *	dl a + b - h dl + (d - 1)/d >= 0
+ *
+ * This constraint corresponds to (10) in
+ * "Hybrid Hexagonal/Classical Tiling for GPUs".
+ */
+static __isl_give isl_constraint *hex_upper_left(__isl_take isl_local_space *ls,
+	__isl_keep isl_val *h, __isl_keep isl_val *dl)
+{
+	isl_val *v, *d;
+	isl_aff *aff;
+
+	d = isl_val_get_den_val(dl);
+	v = isl_val_sub_ui(isl_val_copy(d), 1);
+	v = isl_val_div(v, d);
+	v = isl_val_sub(v, isl_val_mul(isl_val_copy(h), isl_val_copy(dl)));
+	aff = isl_aff_val_on_domain(ls, v);
+	aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, isl_val_copy(dl));
+	aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, 1);
+
+	return isl_inequality_from_aff(aff);
+}
+
+/* Construct the upper right constraint of the hexagonal tile, i.e.,
+ *
+ *	du a - b >= du h - duh - (s0-1) - dlh - (d - 1)/d	with d = den(du)
+ *	du a - b - du h + duh + (s0-1) + dlh + (d - 1)/d >= 0
+ *
+ * where dlh = floor(dl * h) and duh = floor(du * h).
+ *
+ * This constraint corresponds to (12) in
+ * "Hybrid Hexagonal/Classical Tiling for GPUs".
+ */
+static __isl_give isl_constraint *hex_upper_right(
+	__isl_take isl_local_space *ls, __isl_keep isl_val *h,
+	__isl_keep isl_val *s0, __isl_keep isl_val *du,
+	__isl_keep isl_val *dlh, __isl_keep isl_val *duh)
+{
+	isl_val *v, *d;
+	isl_aff *aff;
+
+	d = isl_val_get_den_val(du);
+	v = isl_val_sub_ui(isl_val_copy(d), 1);
+	v = isl_val_div(v, d);
+	v = isl_val_sub(v, isl_val_mul(isl_val_copy(h), isl_val_copy(du)));
+	v = isl_val_add(v, isl_val_copy(duh));
+	v = isl_val_add(v, isl_val_copy(dlh));
+	v = isl_val_add(v, isl_val_copy(s0));
+	v = isl_val_sub_ui(v, 1);
+	aff = isl_aff_val_on_domain(ls, v);
+	aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, isl_val_copy(du));
+	aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, -1);
+
+	return isl_inequality_from_aff(aff);
+}
+
+/* Construct the upper constraint of the hexagonal tile, i.e.,
+ *
+ *	a >= 0
+ *
+ * This constraint corresponds to (13) in
+ * "Hybrid Hexagonal/Classical Tiling for GPUs".
+ */
+static __isl_give isl_constraint *hex_upper(__isl_take isl_local_space *ls)
+{
+	isl_aff *aff;
+
+	aff = isl_aff_var_on_domain(ls, isl_dim_set, 0);
+
+	return isl_inequality_from_aff(aff);
+}
+
+/* Construct the basic hexagonal tile shape.
+ * "space" is the 2D space in which the hexagon should be constructed.
+ * h is st-1, with st the tile size in the time dimension
+ * s0 is the tile size in the space dimension
+ * dl is a bound on the negative relative dependence distances, i.e.,
+ *
+ *	d_s >= -dl d_t
+ *
+ * du is a bound on the positive relative dependence distances, i.e.,
+ *
+ *	d_s <= du d_t
+ *
+ * with (d_t,d_s) any dependence distance vector.
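+ *
+ * For instance, a stencil in which iteration (t, s) reads values
+ * produced at (t - 1, s - 1) and (t - 1, s + 1) (hypothetical values,
+ * for illustration) has dl = du = 1; with h = 2, both of the
+ * quantities defined next then evaluate to 2.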
+ *	dlh = floor(dl * h)
+ *	duh = floor(du * h)
+ *
+ * The shape of the hexagon is as follows:
+ *
+ *		 0   dlh       dlh+s0-1
+ *		    ______                  __
+ *	0	   /      \_               /
+ *		  /         \_            /
+ *	h	 /            \ ______   /
+ *	h+1	 \_           //      \\_
+ *		   \_        //         \\_
+ *	2h+1	     \______//            \\
+ *		    0     duh       duh+s0-1
+ *		                    duh+s0-1+dlh
+ *		                    duh+s0-1+dlh+1+s0+1
+ *
+ * The next hexagon is shifted by duh + dlh + 2 * s0.
+ *
+ * The slope of the "/" constraints is dl.
+ * The slope of the "\_" constraints is du.
+ */
+static __isl_give isl_set *compute_hexagon(__isl_take isl_space *space,
+	__isl_keep isl_val *h, __isl_keep isl_val *s0,
+	__isl_keep isl_val *dl, __isl_keep isl_val *du,
+	__isl_keep isl_val *dlh, __isl_keep isl_val *duh)
+{
+	isl_local_space *ls;
+	isl_constraint *c;
+	isl_basic_set *bset;
+
+	ls = isl_local_space_from_space(space);
+
+	c = hex_lower_left(isl_local_space_copy(ls), h, du, duh);
+	bset = isl_basic_set_from_constraint(c);
+
+	c = hex_lower(isl_local_space_copy(ls), h);
+	bset = isl_basic_set_add_constraint(bset, c);
+
+	c = hex_lower_right(isl_local_space_copy(ls), h, s0, dl, duh);
+	bset = isl_basic_set_add_constraint(bset, c);
+
+	c = hex_upper_left(isl_local_space_copy(ls), h, dl);
+	bset = isl_basic_set_add_constraint(bset, c);
+
+	c = hex_upper_right(isl_local_space_copy(ls), h, s0, du, dlh, duh);
+	bset = isl_basic_set_add_constraint(bset, c);
+
+	c = hex_upper(ls);
+	bset = isl_basic_set_add_constraint(bset, c);
+
+	return isl_set_from_basic_set(bset);
+}
+
+/* Name of the ts-space.
+ */
+static const char *ts_space_name = "ts";
+
+/* Construct and return the space ts[t, s].
+ */
+static __isl_give isl_space *construct_ts_space(isl_ctx *ctx)
+{
+	isl_space *s;
+
+	s = isl_space_set_alloc(ctx, 0, 2);
+	s = isl_space_set_tuple_name(s, isl_dim_set, ts_space_name);
+
+	return s;
+}
+
+/* Name of the local ts-space.
+ */
+static const char *local_ts_space_name = "local_ts";
+
+/* Construct and return the space local_ts[t, s].
+ */
+static __isl_give isl_space *construct_local_ts_space(isl_ctx *ctx)
+{
+	isl_space *s;
+
+	s = isl_space_set_alloc(ctx, 0, 2);
+	s = isl_space_set_tuple_name(s, isl_dim_set, local_ts_space_name);
+
+	return s;
+}
+
+/* Compute the total size of a tile for the space dimensions,
+ * i.e., those corresponding to the child node
+ * of the input pattern.
+ * If S_0 is the original tile size in the first space dimension,
+ * then the first entry of "space_sizes" is equal to
+ * W = 2*S_0 + floor(d_l h) + floor(d_u h).
+ * The remaining entries are the same as in the original tile sizes.
+ * "tile_sizes" contains the original tile sizes, including
+ * the tile size corresponding to the parent node.
+ * "dlh" is equal to floor(d_l h).
+ * "duh" is equal to floor(d_u h).
+ */
+static __isl_give isl_multi_val *compute_space_sizes(
+	__isl_keep isl_multi_val *tile_sizes,
+	__isl_keep isl_val *dlh, __isl_keep isl_val *duh)
+{
+	isl_val *size;
+	isl_multi_val *space_sizes;
+
+	space_sizes = isl_multi_val_copy(tile_sizes);
+	space_sizes = isl_multi_val_factor_range(space_sizes);
+	size = isl_multi_val_get_val(space_sizes, 0);
+	size = isl_val_mul_ui(size, 2);
+	size = isl_val_add(size, isl_val_copy(duh));
+	size = isl_val_add(size, isl_val_copy(dlh));
+	space_sizes = isl_multi_val_set_val(space_sizes, 0, size);
+
+	return space_sizes;
+}
+
+/* Compute the offset of phase 1 with respect to phase 0
+ * in the ts-space ("space").
+ * In particular, return
+ *
+ *	ts[st, s0 + duh]
+ */
+static __isl_give isl_multi_val *compute_phase_shift(
+	__isl_keep isl_space *space, __isl_keep isl_val *st,
+	__isl_keep isl_val *s0, __isl_keep isl_val *duh)
+{
+	isl_val *v;
+	isl_multi_val *phase_shift;
+
+	phase_shift = isl_multi_val_zero(isl_space_copy(space));
+	phase_shift = isl_multi_val_set_val(phase_shift, 0, isl_val_copy(st));
+	v = isl_val_add(isl_val_copy(duh), isl_val_copy(s0));
+	phase_shift = isl_multi_val_set_val(phase_shift, 1, v);
+
+	return phase_shift;
+}
+
+/* Return the function
+ *
+ *	ts[t, s] -> floor(t/(2 * st))
+ *
+ * representing the time tile.
+ * "space" is the space ts[t, s].
+ */
+static __isl_give isl_aff *compute_time_tile(__isl_keep isl_space *space,
+	__isl_keep isl_val *st)
+{
+	isl_val *v;
+	isl_aff *t;
+	isl_local_space *ls;
+
+	ls = isl_local_space_from_space(isl_space_copy(space));
+	t = isl_aff_var_on_domain(ls, isl_dim_set, 0);
+	v = isl_val_mul_ui(isl_val_copy(st), 2);
+	t = isl_aff_floor(isl_aff_scale_down_val(t, v));
+
+	return t;
+}
+
+/* Compute a shift in the space dimension for tiles
+ * at time tile T = floor(t/(2 * S_t))
+ * such that they align to a multiple of the total space tile dimension W.
+ * In particular, compute
+ *
+ *	ts[t, s] -> s + (-(2 * shift_s)*T) % W
+ *
+ * where shift_s is the shift of phase 1 with respect to phase 0
+ * in the space dimension (the second element of "phase_shift").
+ * W is stored in the first element of "space_sizes".
+ * "time_tile" is the function
+ *
+ *	ts[t, s] -> floor(t/(2 * S_t))
+ *
+ * Since phase 1 is shifted by shift_s with respect to phase 0,
+ * the next line of phase 0 (at T+1) is shifted by 2*shift_s
+ * with respect to the previous line (at T).
+ * A shift of -(2 * shift_s)*T therefore allows the basic pattern
+ * (which starts at 0) to be applied.
+ * However, this shift will be used to obtain the tile coordinate
+ * in the first space dimension and if the original values
+ * in the space dimension are non-negative, then the shift should
+ * not make them negative. Moreover, the shift should be as minimal
+ * as possible.
+ * Since the pattern repeats itself with a period of W in the space
+ * dimension, the shift can be replaced by (-(2 * shift_s)*T) % W.
+ */
+static __isl_give isl_aff *compute_shift_space(__isl_keep isl_aff *time_tile,
+	__isl_keep isl_multi_val *space_sizes,
+	__isl_keep isl_multi_val *phase_shift)
+{
+	isl_val *v;
+	isl_aff *s, *t;
+	isl_local_space *ls;
+
+	ls = isl_local_space_from_space(isl_aff_get_domain_space(time_tile));
+	t = isl_aff_copy(time_tile);
+	v = isl_val_mul_ui(isl_multi_val_get_val(phase_shift, 1), 2);
+	v = isl_val_neg(v);
+	t = isl_aff_scale_val(t, v);
+	v = isl_multi_val_get_val(space_sizes, 0);
+	t = isl_aff_mod_val(t, v);
+	s = isl_aff_var_on_domain(ls, isl_dim_set, 1);
+	s = isl_aff_add(s, t);
+
+	return s;
+}
+
+/* Given the phase_shift ts[S_t, S_0 + floor(d_u h)],
+ * compute a function that applies the shift, i.e.,
+ *
+ *	ts[t, s] -> ts[t + S_t, s + S_0 + floor(d_u h)]
+ */
+static __isl_give isl_multi_aff *compute_shift_phase(
+	__isl_keep isl_multi_val *phase_shift)
+{
+	isl_space *space;
+	isl_multi_aff *shift;
+
+	space = isl_multi_val_get_space(phase_shift);
+	shift = isl_multi_aff_multi_val_on_space(space,
+		isl_multi_val_copy(phase_shift));
+	space = isl_multi_aff_get_space(shift);
+	shift = isl_multi_aff_add(shift, isl_multi_aff_identity(space));
+
+	return shift;
+}
+
+/* Compute a mapping from the ts-space to the local coordinates
+ * within each tile.
In particular, compute + * + * ts[t, s] -> local_ts[t % (2 S_t), (s + (-(2 * shift_s)*T) % W) % W] + * + * "ts" is the space ts[t, s] + * "local_ts" is the space local_ts[t, s] + * "shift_space" is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W + * "st" is the tile size in the time dimension S_t. + * The first element of "space_sizes" is equal to W. + */ +static __isl_give isl_multi_aff *compute_localize( + __isl_keep isl_space *local_ts, __isl_keep isl_aff *shift_space, + __isl_keep isl_val *st, __isl_keep isl_multi_val *space_sizes) +{ + isl_val *v; + isl_space *space; + isl_aff *s, *t; + isl_multi_aff *localize; + + space = isl_aff_get_domain_space(shift_space); + local_ts = isl_space_copy(local_ts); + space = isl_space_map_from_domain_and_range(space, local_ts); + localize = isl_multi_aff_identity(space); + t = isl_multi_aff_get_aff(localize, 0); + v = isl_val_mul_ui(isl_val_copy(st), 2); + t = isl_aff_mod_val(t, v); + localize = isl_multi_aff_set_aff(localize, 0, t); + s = isl_aff_copy(shift_space); + v = isl_multi_val_get_val(space_sizes, 0); + s = isl_aff_mod_val(s, v); + localize = isl_multi_aff_set_aff(localize, 1, s); + + return localize; +} + +/* Set the project_ts field of "tiling". + * + * This field projects the space of the input schedule to the ts-space. + * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0]. + */ +static __isl_give ppcg_ht_tiling *ppcg_ht_tiling_set_project_ts( + __isl_take ppcg_ht_tiling *tiling) +{ + int n; + isl_space *space; + isl_multi_aff *project; + + if (!tiling) + return NULL; + + space = ppcg_ht_tiling_get_input_space(tiling); + n = isl_space_dim(space, isl_dim_set); + project = isl_multi_aff_project_out_map(space, isl_dim_set, 2, n - 2); + project = isl_multi_aff_set_tuple_name(project, + isl_dim_out, ts_space_name); + if (!project) + return ppcg_ht_tiling_free(tiling); + + tiling->project_ts = project; + + return tiling; +} + +/* Construct a hybrid tiling description from bounds on the dependence + * distances "bounds". + * "input_node" points to the original parent node. + * "input_schedule" is the combined schedule of the parent and child + * node in the input. + * "tile_sizes" are the original, user specified tile sizes. 
+ */ +static __isl_give ppcg_ht_tiling *ppcg_ht_bounds_construct_tiling( + __isl_take ppcg_ht_bounds *bounds, + __isl_keep isl_schedule_node *input_node, + __isl_keep isl_multi_union_pw_aff *input_schedule, + __isl_keep isl_multi_val *tile_sizes) +{ + isl_ctx *ctx; + ppcg_ht_tiling *tiling; + isl_multi_val *space_sizes, *phase_shift; + isl_aff *time_tile, *shift_space; + isl_multi_aff *localize; + isl_val *h, *duh, *dlh; + isl_val *st, *s0, *du, *dl; + isl_space *ts, *local_ts; + + if (!bounds || !input_node || !input_schedule || !tile_sizes) + goto error; + + ctx = isl_multi_union_pw_aff_get_ctx(input_schedule); + tiling = isl_calloc_type(ctx, struct ppcg_ht_tiling); + if (!tiling) + goto error; + tiling->ref = 1; + + st = isl_multi_val_get_val(tile_sizes, 0); + h = isl_val_sub_ui(isl_val_copy(st), 1); + s0 = isl_multi_val_get_val(tile_sizes, 1); + du = ppcg_ht_bounds_get_upper(bounds); + dl = ppcg_ht_bounds_get_lower(bounds, 0); + + duh = isl_val_floor(isl_val_mul(isl_val_copy(du), isl_val_copy(h))); + dlh = isl_val_floor(isl_val_mul(isl_val_copy(dl), isl_val_copy(h))); + + ts = construct_ts_space(ctx); + local_ts = construct_local_ts_space(ctx); + + space_sizes = compute_space_sizes(tile_sizes, dlh, duh); + phase_shift = compute_phase_shift(ts, st, s0, duh); + time_tile = compute_time_tile(ts, st); + shift_space = compute_shift_space(time_tile, space_sizes, phase_shift); + localize = compute_localize(local_ts, shift_space, st, space_sizes); + isl_space_free(ts); + + tiling->input_node = isl_schedule_node_copy(input_node); + tiling->input_schedule = isl_multi_union_pw_aff_copy(input_schedule); + tiling->space_sizes = space_sizes; + tiling->bounds = bounds; + tiling->local_time = isl_multi_aff_get_aff(localize, 0); + tiling->hex = compute_hexagon(local_ts, h, s0, dl, du, dlh, duh); + tiling->hex = isl_set_preimage_multi_aff(tiling->hex, localize); + tiling->time_tile = time_tile; + tiling->shift_space = shift_space; + tiling->shift_phase = compute_shift_phase(phase_shift); + isl_multi_val_free(phase_shift); + + isl_val_free(duh); + isl_val_free(dlh); + isl_val_free(du); + isl_val_free(dl); + isl_val_free(s0); + isl_val_free(st); + isl_val_free(h); + + if (!tiling->input_schedule || !tiling->local_time || !tiling->hex || + !tiling->shift_space || !tiling->shift_phase) + return ppcg_ht_tiling_free(tiling); + + tiling = ppcg_ht_tiling_set_project_ts(tiling); + + return tiling; +error: + ppcg_ht_bounds_free(bounds); + return NULL; +} + +/* Are all members of the band node "node" coincident? + */ +static isl_bool all_coincident(__isl_keep isl_schedule_node *node) +{ + int i, n; + + n = isl_schedule_node_band_n_member(node); + for (i = 0; i < n; ++i) { + isl_bool c; + + c = isl_schedule_node_band_member_get_coincident(node, i); + if (c < 0 || !c) + return c; + } + + return isl_bool_true; +} + +/* Does "node" satisfy the properties of the inner node in the input + * pattern for hybrid tiling? + * That is, is it a band node with only coincident members, of which + * there is at least one? + */ +static isl_bool has_child_properties(__isl_keep isl_schedule_node *node) +{ + if (!node) + return isl_bool_error; + if (isl_schedule_node_get_type(node) != isl_schedule_node_band) + return isl_bool_false; + if (isl_schedule_node_band_n_member(node) < 1) + return isl_bool_false; + return all_coincident(node); +} + +/* Does "node" satisfy the properties of the outer node in the input + * pattern for hybrid tiling? + * That is, is it a band node with a single member? 
+ */
+static isl_bool has_parent_properties(__isl_keep isl_schedule_node *node)
+{
+	if (!node)
+		return isl_bool_error;
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
+		return isl_bool_false;
+	if (isl_schedule_node_band_n_member(node) != 1)
+		return isl_bool_false;
+	return isl_bool_true;
+}
+
+/* Does the parent of "node" satisfy the input pattern for hybrid tiling?
+ * That is, does "node" satisfy the properties of the inner node and
+ * does the parent of "node" satisfy the properties of the outer node?
+ */
isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node)
+{
+	isl_bool has_pattern;
+
+	has_pattern = has_child_properties(node);
+	if (has_pattern < 0 || !has_pattern)
+		return has_pattern;
+
+	node = isl_schedule_node_copy(node);
+	node = isl_schedule_node_parent(node);
+	has_pattern = has_parent_properties(node);
+	isl_schedule_node_free(node);
+
+	return has_pattern;
+}
+
+/* Does "node" satisfy the input pattern for hybrid tiling?
+ * That is, does "node" satisfy the properties of the outer node and
+ * does the child of "node" satisfy the properties of the inner node?
+ */
+isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node)
+{
+	isl_bool has_pattern;
+
+	has_pattern = has_parent_properties(node);
+	if (has_pattern < 0 || !has_pattern)
+		return has_pattern;
+
+	node = isl_schedule_node_get_child(node, 0);
+	has_pattern = has_child_properties(node);
+	isl_schedule_node_free(node);
+
+	return has_pattern;
+}
+
+/* Check that "node" satisfies the input pattern for hybrid tiling.
+ * Error out if it does not.
+ */
+static isl_stat check_input_pattern(__isl_keep isl_schedule_node *node)
+{
+	isl_bool has_pattern;
+
+	has_pattern = ppcg_ht_has_input_pattern(node);
+	if (has_pattern < 0)
+		return isl_stat_error;
+	if (!has_pattern)
+		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
+			"invalid input pattern for hybrid tiling",
+			return isl_stat_error);
+
+	return isl_stat_ok;
+}
+
+/* Extract the input schedule from "node", i.e., the product
+ * of the partial schedules of the parent and child nodes
+ * in the input pattern.
+ */
+static __isl_give isl_multi_union_pw_aff *extract_input_schedule(
+	__isl_keep isl_schedule_node *node)
+{
+	isl_multi_union_pw_aff *partial, *partial2;
+
+	partial = isl_schedule_node_band_get_partial_schedule(node);
+	node = isl_schedule_node_get_child(node, 0);
+	partial2 = isl_schedule_node_band_get_partial_schedule(node);
+	isl_schedule_node_free(node);
+
+	return isl_multi_union_pw_aff_range_product(partial, partial2);
+}
+
+/* Collect all dependences from "scop" that are relevant for performing
+ * hybrid tiling on "node" and its child and map them to the schedule
+ * space of this pair of nodes.
+ *
+ * In case live range reordering is not used,
+ * the flow and the false dependences are collected.
+ * In case live range reordering is used,
+ * the flow and the forced dependences are collected, as well
+ * as the order dependences that are adjacent to non-local
+ * flow dependences.
+ *
+ * In all cases, only dependences that map to the same instance
+ * of the outer part of the schedule are considered.
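+ *
+ * In other words, for each value of the prefix schedule, only the
+ * dependences between instances that share that value are kept; this
+ * is the purpose of the calls to isl_union_map_eq_at_multi_union_pw_aff
+ * below.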
+ */ +static __isl_give isl_map *collect_deps(struct ppcg_scop *scop, + __isl_keep isl_schedule_node *node) +{ + isl_space *space; + isl_multi_union_pw_aff *prefix, *partial; + isl_union_map *flow, *other, *dep, *umap; + isl_map *map; + + prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node); + partial = extract_input_schedule(node); + space = isl_multi_union_pw_aff_get_space(partial); + + flow = isl_union_map_copy(scop->dep_flow); + flow = isl_union_map_eq_at_multi_union_pw_aff(flow, + isl_multi_union_pw_aff_copy(prefix)); + if (!scop->options->live_range_reordering) { + other = isl_union_map_copy(scop->dep_false); + other = isl_union_map_eq_at_multi_union_pw_aff(other, prefix); + } else { + isl_union_map *local, *non_local, *order, *adj; + isl_union_set *domain, *range; + + other = isl_union_map_copy(scop->dep_forced); + other = isl_union_map_eq_at_multi_union_pw_aff(other, + isl_multi_union_pw_aff_copy(prefix)); + local = isl_union_map_copy(flow); + local = isl_union_map_eq_at_multi_union_pw_aff(local, + isl_multi_union_pw_aff_copy(partial)); + non_local = isl_union_map_copy(flow); + non_local = isl_union_map_subtract(non_local, local); + + order = isl_union_map_copy(scop->dep_order); + order = isl_union_map_eq_at_multi_union_pw_aff(order, prefix); + adj = isl_union_map_copy(order); + domain = isl_union_map_domain(isl_union_map_copy(non_local)); + domain = isl_union_set_coalesce(domain); + adj = isl_union_map_intersect_range(adj, domain); + other = isl_union_map_union(other, adj); + + adj = order; + range = isl_union_map_range(non_local); + range = isl_union_set_coalesce(range); + adj = isl_union_map_intersect_domain(adj, range); + other = isl_union_map_union(other, adj); + } + dep = isl_union_map_union(flow, other); + + umap = isl_union_map_from_multi_union_pw_aff(partial); + dep = isl_union_map_apply_domain(dep, isl_union_map_copy(umap)); + dep = isl_union_map_apply_range(dep, umap); + + space = isl_space_map_from_set(space); + map = isl_union_map_extract_map(dep, space); + isl_union_map_free(dep); + + map = isl_map_coalesce(map); + + return map; +} + +/* Given a constraint of the form + * + * a i_0 + b i_1 >= 0 + * or + * a i_0 + b i_1 = 0 + * + * use it to update one or both of the non-negative bounds + * in "list" = (min, max) such that + * + * i_1 >= -min i_0 + * and + * i_1 <= max i_0 + * + * If b = 0, then the constraint cannot be used. + * Otherwise, the constraint is equivalent to + * + * sgn(b) i_1 >= - a/abs(b) i_0 + * i.e., + * i_1 >= - a/abs(b) i_0 + * or + * i_1 <= a/abs(b) i_0 + * + * Set the first or second element of "list" to max(0, a/abs(b)), + * according to the sign of "b". Or set both in case the constraint + * is an equality, taking into account the sign change. + */ +static __isl_give isl_val_list *list_set_min_max(__isl_take isl_val_list *list, + __isl_keep isl_constraint *c) +{ + isl_val *a, *b; + int sign; + int pos; + isl_bool eq, is_zero, is_neg; + + eq = isl_constraint_is_equality(c); + if (eq < 0) + return isl_val_list_free(list); + + b = isl_constraint_get_coefficient_val(c, isl_dim_set, 1); + is_zero = isl_val_is_zero(b); + if (is_zero == isl_bool_true) { + isl_val_free(b); + return list; + } + a = isl_constraint_get_coefficient_val(c, isl_dim_set, 0); + sign = isl_val_sgn(b); + b = isl_val_abs(b); + a = isl_val_div(a, b); + + if (eq) + b = isl_val_copy(a); + + pos = sign > 0 ? 
0 : 1;
+	is_neg = isl_val_is_neg(a);
+	if (is_neg == isl_bool_true)
+		a = isl_val_set_si(a, 0);
+	list = isl_val_list_set_val(list, pos, a);
+
+	if (!eq)
+		return is_neg < 0 ? isl_val_list_free(list) : list;
+
+	pos = 1 - pos;
+	a = isl_val_neg(b);
+	is_neg = isl_val_is_neg(a);
+	if (is_neg == isl_bool_true)
+		a = isl_val_set_si(a, 0);
+	list = isl_val_list_set_val(list, pos, a);
+
+	return is_neg < 0 ? isl_val_list_free(list) : list;
+}
+
+/* If constraint "c" passes through the origin, then try and use it
+ * to update the non-negative bounds in "list" = (min, max) such that
+ *
+ *	i_1 >= -min i_0
+ * and
+ *	i_1 <= max i_0
+ */
+static isl_stat set_min_max(__isl_take isl_constraint *c, void *user)
+{
+	isl_val *v;
+	isl_val_list **list = user;
+	isl_bool is_zero;
+
+	v = isl_constraint_get_constant_val(c);
+	is_zero = isl_val_is_zero(v);
+	isl_val_free(v);
+
+	if (is_zero == isl_bool_true)
+		*list = list_set_min_max(*list, c);
+
+	isl_constraint_free(c);
+	return is_zero < 0 ? isl_stat_error : isl_stat_ok;
+}
+
+/* Given a set of dependence distance vectors "dist", compute
+ * a pair of non-negative bounds min and max such that
+ *
+ *	d_pos >= -min d_0
+ * and
+ *	d_pos <= max d_0
+ *
+ * and return the pair (min, max).
+ * If no bound can be found in either direction, then the bound
+ * is replaced by NaN.
+ *
+ * The dependence distances are first projected onto the (d_0, d_pos)
+ * plane. Then the zero dependence distance is added and the convex
+ * hull is computed.
+ * Finally, the bounds are extracted from the constraints of the convex hull
+ * that pass through the origin.
+ */
+static __isl_give isl_val_list *min_max_dist(__isl_keep isl_set *dist, int pos)
+{
+	isl_space *space;
+	isl_basic_set *hull;
+	int dim;
+	isl_ctx *ctx;
+	isl_val *nan;
+	isl_val_list *list;
+
+	ctx = isl_set_get_ctx(dist);
+	nan = isl_val_nan(ctx);
+	list = isl_val_list_alloc(ctx, 2);
+	list = isl_val_list_add(list, isl_val_copy(nan));
+	list = isl_val_list_add(list, nan);
+
+	dist = isl_set_copy(dist);
+	dim = isl_set_dim(dist, isl_dim_set);
+	if (dist && pos >= dim)
+		isl_die(ctx, isl_error_internal, "position out of bounds",
+			dist = isl_set_free(dist));
+	dist = isl_set_project_out(dist, isl_dim_set, pos + 1, dim - (pos + 1));
+	dist = isl_set_project_out(dist, isl_dim_set, 1, pos - 1);
+
+	space = isl_set_get_space(dist);
+	dist = isl_set_union(dist, isl_set_from_point(isl_point_zero(space)));
+	dist = isl_set_remove_divs(dist);
+	hull = isl_set_convex_hull(dist);
+
+	if (isl_basic_set_foreach_constraint(hull, &set_min_max, &list) < 0)
+		list = isl_val_list_free(list);
+	isl_basic_set_free(hull);
+
+	return list;
+}
+
+/* Given a schedule node "node" that, together with its child,
+ * satisfies the input pattern for hybrid tiling, compute bounds
+ * on the relative dependence distances of the child node with
+ * respect to the parent node. These bounds are needed to
+ * construct a hybrid tiling.
+ *
+ * First all relevant dependences are collected and mapped
+ * to the schedule space of the pair of nodes. Then, the
+ * dependence distances are computed in this space.
+ *
+ * These dependence distances are then projected onto a two-dimensional
+ * space consisting of the single schedule dimension of the outer node
+ * and one of the schedule dimensions of the inner node.
+ * The maximal and minimal relative dependence distances are extracted
+ * from these projections.
+ * This process is repeated for each of the schedule dimensions
+ * of the inner node.
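+ * As a purely numerical illustration (not taken from this change),
+ * a distance vector (1, -2) in such a projected space contributes
+ * a lower bound of at least 2, since d_1 >= -min d_0 with d_0 = 1
+ * and d_1 = -2 requires min >= 2.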
+ * For the first dimension, both minimal and
+ * maximal relative dependence distances are stored in the result.
+ * For the other dimensions, only the minimal relative dependence
+ * distance is stored.
+ */
+__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop,
+	__isl_keep isl_schedule_node *node)
+{
+	ppcg_ht_bounds *bnd;
+	isl_space *space;
+	isl_map *map;
+	isl_set *dist;
+	isl_val_list *pair;
+	isl_schedule_node *child;
+	int n;
+	int i, dim;
+
+	if (!scop || !node || check_input_pattern(node) < 0)
+		return NULL;
+
+	child = isl_schedule_node_get_child(node, 0);
+	space = isl_schedule_node_band_get_space(child);
+	dim = isl_schedule_node_band_n_member(child);
+	isl_schedule_node_free(child);
+	bnd = ppcg_ht_bounds_alloc(space);
+	if (!bnd)
+		return NULL;
+
+	map = collect_deps(scop, node);
+
+	dist = isl_map_deltas(map);
+	n = isl_set_dim(dist, isl_dim_param);
+	dist = isl_set_project_out(dist, isl_dim_param, 0, n);
+
+	pair = min_max_dist(dist, 1);
+	bnd = ppcg_ht_bounds_set_lower(bnd, 0, isl_val_list_get_val(pair, 0));
+	bnd = ppcg_ht_bounds_set_upper(bnd, isl_val_list_get_val(pair, 1));
+	isl_val_list_free(pair);
+
+	for (i = 1; i < dim; ++i) {
+		pair = min_max_dist(dist, 1 + i);
+		bnd = ppcg_ht_bounds_set_lower(bnd, i,
+			isl_val_list_get_val(pair, 0));
+		isl_val_list_free(pair);
+	}
+
+	isl_set_free(dist);
+
+	return bnd;
+}
+
+/* Check if all the fields of "phase" are valid, freeing "phase"
+ * if they are not.
+ */
+static __isl_give ppcg_ht_phase *check_phase(__isl_take ppcg_ht_phase *phase)
+{
+	if (!phase)
+		return NULL;
+
+	if (!phase->tiling || !phase->local_time ||
+	    !phase->shift_space || !phase->domain)
+		return ppcg_ht_phase_free(phase);
+
+	return phase;
+}
+
+/* Construct a ppcg_ht_phase object that simply copies
+ * information from "tiling".
+ * That is, the result is defined over the "ts" space and
+ * corresponds to phase 1.
+ */
+static __isl_give ppcg_ht_phase *construct_phase(
+	__isl_keep ppcg_ht_tiling *tiling)
+{
+	isl_ctx *ctx;
+	ppcg_ht_phase *phase;
+
+	if (!tiling)
+		return NULL;
+
+	ctx = ppcg_ht_tiling_get_ctx(tiling);
+	phase = isl_calloc_type(ctx, struct ppcg_ht_phase);
+	if (!phase)
+		return NULL;
+	phase->tiling = ppcg_ht_tiling_copy(tiling);
+	phase->time_tile = isl_aff_copy(tiling->time_tile);
+	phase->local_time = isl_aff_copy(tiling->local_time);
+	phase->shift_space = isl_aff_copy(tiling->shift_space);
+	phase->domain = isl_set_copy(tiling->hex);
+
+	return check_phase(phase);
+}
+
+/* Align the parameters of the elements of "phase" to those of "space".
+ */
+static __isl_give ppcg_ht_phase *phase_align_params(
+	__isl_take ppcg_ht_phase *phase, __isl_take isl_space *space)
+{
+	if (!phase)
+		goto error;
+
+	phase->time_tile = isl_aff_align_params(phase->time_tile,
+		isl_space_copy(space));
+	phase->local_time = isl_aff_align_params(phase->local_time,
+		isl_space_copy(space));
+	phase->shift_space = isl_aff_align_params(phase->shift_space,
+		isl_space_copy(space));
+	phase->domain = isl_set_align_params(phase->domain, space);
+
+	return check_phase(phase);
+error:
+	isl_space_free(space);
+	return NULL;
+}
+
+/* Pull back "phase" over "ma".
+ * That is, take a phase defined over the range of "ma" and
+ * turn it into a phase defined over the domain of "ma".
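+ *
+ * For example (illustration only), if "ma" maps some space X to
+ * the "ts" space and phase->domain is a set in that "ts" space,
+ * then the resulting phase->domain consists of those elements of X
+ * that "ma" maps into the original domain.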
+ */ +static __isl_give ppcg_ht_phase *pullback_phase(__isl_take ppcg_ht_phase *phase, + __isl_take isl_multi_aff *ma) +{ + phase = phase_align_params(phase, isl_multi_aff_get_space(ma)); + if (!phase) + goto error; + + phase->time_tile = isl_aff_pullback_multi_aff(phase->time_tile, + isl_multi_aff_copy(ma)); + phase->local_time = isl_aff_pullback_multi_aff(phase->local_time, + isl_multi_aff_copy(ma)); + phase->shift_space = isl_aff_pullback_multi_aff(phase->shift_space, + isl_multi_aff_copy(ma)); + phase->domain = isl_set_preimage_multi_aff(phase->domain, ma); + + return check_phase(phase); +error: + isl_multi_aff_free(ma); + return NULL; +} + +/* Pullback "phase" over phase->tiling->shift_phase, which shifts + * phase 0 to phase 1. The pullback therefore takes a phase 1 + * description and turns it into a phase 0 description. + */ +static __isl_give ppcg_ht_phase *shift_phase(__isl_take ppcg_ht_phase *phase) +{ + ppcg_ht_tiling *tiling; + + if (!phase) + return NULL; + + tiling = phase->tiling; + return pullback_phase(phase, isl_multi_aff_copy(tiling->shift_phase)); +} + +/* Take a "phase" defined over the ts-space and plug in the projection + * from the input schedule space to the ts-space. + * The result is then defined over this input schedule space. + */ +static __isl_give ppcg_ht_phase *lift_phase(__isl_take ppcg_ht_phase *phase) +{ + ppcg_ht_tiling *tiling; + + if (!phase) + return NULL; + + tiling = phase->tiling; + return pullback_phase(phase, isl_multi_aff_copy(tiling->project_ts)); +} + +/* Compute the shift that should be added to the space band + * in order to be able to apply rectangular tiling to the space. + * Store the shift in phase->space_shift. + * + * In the first dimension, it is equal to shift_space - s. + * For phase 1, this results in + * + * (-(2 * shift_s)*T) % W + * + * In phase 0, the "s" in shift_space has been replaced by "s + shift_s", + * so the result is + * + * shift_s + (-(2 * shift_s)*T) % W + * + * In the other dimensions, the shift is equal to + * + * dl_i * local_time. + */ +static __isl_give ppcg_ht_phase *compute_space_shift( + __isl_take ppcg_ht_phase *phase) +{ + int i, n; + isl_space *space; + isl_local_space *ls; + isl_aff *aff, *s; + isl_multi_aff *space_shift; + + if (!phase) + return NULL; + + space = ppcg_ht_phase_get_input_space(phase); + space = isl_space_unwrap(space); + space = isl_space_range_map(space); + + space_shift = isl_multi_aff_zero(space); + aff = isl_aff_copy(phase->shift_space); + ls = isl_local_space_from_space(isl_aff_get_domain_space(aff)); + s = isl_aff_var_on_domain(ls, isl_dim_set, 1); + aff = isl_aff_sub(aff, s); + space_shift = isl_multi_aff_set_aff(space_shift, 0, aff); + + n = isl_multi_aff_dim(space_shift, isl_dim_out); + for (i = 1; i < n; ++i) { + isl_val *v; + isl_aff *time; + + v = ppcg_ht_bounds_get_lower(phase->tiling->bounds, i); + time = isl_aff_copy(phase->local_time); + time = isl_aff_scale_val(time, v); + space_shift = isl_multi_aff_set_aff(space_shift, i, time); + } + + if (!space_shift) + return ppcg_ht_phase_free(phase); + phase->space_shift = space_shift; + return phase; +} + +/* Compute the space tiling and store the result in phase->space_tile. 
+ * The space tiling is of the form
+ *
+ *	[P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)]
+ */
+static __isl_give ppcg_ht_phase *compute_space_tile(
+	__isl_take ppcg_ht_phase *phase)
+{
+	isl_space *space;
+	isl_multi_val *space_sizes;
+	isl_multi_aff *space_shift;
+	isl_multi_aff *tile;
+
+	if (!phase)
+		return NULL;
+
+	space = ppcg_ht_phase_get_input_space(phase);
+	space = isl_space_unwrap(space);
+	tile = isl_multi_aff_range_map(space);
+	space_shift = isl_multi_aff_copy(phase->space_shift);
+	tile = isl_multi_aff_add(space_shift, tile);
+	space_sizes = isl_multi_val_copy(phase->tiling->space_sizes);
+	tile = isl_multi_aff_scale_down_multi_val(tile, space_sizes);
+	tile = isl_multi_aff_floor(tile);
+
+	if (!tile)
+		return ppcg_ht_phase_free(phase);
+	phase->space_tile = tile;
+	return phase;
+}
+
+/* Construct a representation for one of the two phases of hybrid tiling
+ * "tiling". If "shift" is not set, then the phase is constructed
+ * directly from the hexagonal tile shape in "tiling", which represents
+ * the phase-1 tiles. If "shift" is set, then this tile shape is shifted
+ * back over tiling->shift_phase to obtain the phase-0 tiles.
+ *
+ * First copy data from "tiling", then optionally shift the phase and
+ * finally move the tiling from the "ts" space of "tiling" to
+ * the space of the input pattern.
+ *
+ * After the basic phase has been computed, also compute
+ * the corresponding space shift.
+ */
+static __isl_give ppcg_ht_phase *ppcg_ht_tiling_compute_phase(
+	__isl_keep ppcg_ht_tiling *tiling, int shift)
+{
+	ppcg_ht_phase *phase;
+
+	phase = construct_phase(tiling);
+	if (shift)
+		phase = shift_phase(phase);
+	phase = lift_phase(phase);
+
+	phase = compute_space_shift(phase);
+	phase = compute_space_tile(phase);
+
+	return phase;
+}
+
+/* Construct a function that is equal to the time tile of "phase0"
+ * on the domain of "phase0" and equal to the time tile of "phase1"
+ * on the domain of "phase1".
+ * The two domains are assumed to form a partition of the input
+ * schedule space.
+ */
+static __isl_give isl_pw_multi_aff *combine_time_tile(
+	__isl_keep ppcg_ht_phase *phase0, __isl_keep ppcg_ht_phase *phase1)
+{
+	isl_aff *T;
+	isl_pw_aff *time, *time1;
+
+	if (!phase0 || !phase1)
+		return NULL;
+
+	T = isl_aff_copy(phase0->time_tile);
+	time = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase0), T);
+
+	T = isl_aff_copy(phase1->time_tile);
+	time1 = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase1), T);
+
+	time = isl_pw_aff_union_add(time, time1);
+
+	return isl_pw_multi_aff_from_pw_aff(time);
+}
+
+/* Name used in mark nodes that contain a pointer to a ppcg_ht_phase.
+ */
+static char *ppcg_phase_name = "phase";
+
+/* Does "id" contain a pointer to a ppcg_ht_phase?
+ * That is, is it called "phase"?
+ */
+static isl_bool is_phase_id(__isl_keep isl_id *id)
+{
+	const char *name;
+
+	name = isl_id_get_name(id);
+	if (!name)
+		return isl_bool_error;
+
+	return !strcmp(name, ppcg_phase_name);
+}
+
+/* Given a mark node with an identifier that points to a ppcg_ht_phase,
+ * extract this ppcg_ht_phase pointer.
+ */
+__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark(
+	__isl_keep isl_schedule_node *node)
+{
+	isl_bool is_phase;
+	isl_id *id;
+	void *p;
+
+	if (!node)
+		return NULL;
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
+		isl_die(isl_schedule_node_get_ctx(node), isl_error_internal,
+			"not a phase mark", return NULL);
+
+	id = isl_schedule_node_mark_get_id(node);
+	is_phase = is_phase_id(id);
+	p = isl_id_get_user(id);
+	isl_id_free(id);
+
+	if (is_phase < 0)
+		return NULL;
+	if (!is_phase)
+		isl_die(isl_schedule_node_get_ctx(node), isl_error_internal,
+			"not a phase mark", return NULL);
+
+	return p;
+}
+
+/* Insert a mark node at "node" holding a pointer to "phase".
+ */
+static __isl_give isl_schedule_node *insert_phase(
+	__isl_take isl_schedule_node *node, __isl_take ppcg_ht_phase *phase)
+{
+	isl_ctx *ctx;
+	isl_id *id;
+
+	if (!node)
+		goto error;
+	ctx = isl_schedule_node_get_ctx(node);
+	id = isl_id_alloc(ctx, ppcg_phase_name, phase);
+	if (!id)
+		goto error;
+	id = isl_id_set_free_user(id, &ppcg_ht_phase_free_wrap);
+	node = isl_schedule_node_insert_mark(node, id);
+
+	return node;
+error:
+	ppcg_ht_phase_free(phase);
+	isl_schedule_node_free(node);
+	return NULL;
+}
+
+/* Construct a mapping from the elements of the original pair of bands
+ * to which tiling was applied that belong to a tile of "phase"
+ * to that tile, preserving the values for the outer bands.
+ *
+ * The mapping is of the form
+ *
+ *	[[outer] -> [P -> C]] -> [[outer] -> [tile]]
+ *
+ * where tile is defined by a concatenation of the time_tile and
+ * the space_tile.
+ */
+static __isl_give isl_map *construct_tile_map(__isl_keep ppcg_ht_phase *phase)
+{
+	int depth;
+	isl_space *space;
+	isl_multi_aff *ma;
+	isl_multi_aff *tiling;
+	isl_map *el2tile;
+
+	depth = isl_schedule_node_get_schedule_depth(
+		phase->tiling->input_node);
+	space = isl_aff_get_space(phase->time_tile);
+	space = isl_space_params(space);
+	space = isl_space_set_from_params(space);
+	space = isl_space_add_dims(space, isl_dim_set, depth);
+	space = isl_space_map_from_set(space);
+	ma = isl_multi_aff_identity(space);
+
+	tiling = isl_multi_aff_flat_range_product(
+		isl_multi_aff_from_aff(isl_aff_copy(phase->time_tile)),
+		isl_multi_aff_copy(phase->space_tile));
+	el2tile = isl_map_from_multi_aff(tiling);
+	el2tile = isl_map_intersect_domain(el2tile,
+		isl_set_copy(phase->domain));
+	el2tile = isl_map_product(isl_map_from_multi_aff(ma), el2tile);
+
+	return el2tile;
+}
+
+/* Return a description of the full tiles of "phase" at the point
+ * in the original schedule tree where the tiling was applied.
+ *
+ * First construct a mapping from the input schedule dimensions
+ * up to and including the original pair of bands to which hybrid tiling
+ * was applied to schedule dimensions in which this original pair
+ * has been replaced by the tiles.
+ * This mapping is of the form
+ *
+ *	[[outer] -> [P -> C]] -> [[outer] -> [tile]]
+ *
+ * Apply this mapping to the set of all values for the input
+ * schedule dimensions and then apply its inverse.
+ * The result is the set of values for the input schedule dimensions
+ * that would map to any of the tiles. Subtracting from this set
+ * the set of values that are actually executed produces the set
+ * of values that belong to a tile but that are not executed.
+ * Mapping these back to the tiles produces a description of
+ * the partial tiles.
+ * Subtracting these from the set of all tiles
+ * produces a description of the full tiles in the form
+ *
+ *	[[outer] -> [tile]]
+ */
+static __isl_give isl_set *compute_full_tile(__isl_keep ppcg_ht_phase *phase)
+{
+	isl_schedule_node *node;
+	isl_union_set *domain;
+	isl_union_map *prefix, *schedule;
+	isl_set *all, *partial, *all_el;
+	isl_map *tile2el, *el2tile;
+	isl_multi_union_pw_aff *mupa;
+
+	el2tile = construct_tile_map(phase);
+	tile2el = isl_map_reverse(isl_map_copy(el2tile));
+
+	node = phase->tiling->input_node;
+	prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
+	domain = isl_schedule_node_get_domain(node);
+	mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule);
+	schedule = isl_union_map_from_multi_union_pw_aff(mupa);
+	schedule = isl_union_map_range_product(prefix, schedule);
+	all_el = isl_set_from_union_set(isl_union_set_apply(domain, schedule));
+	all_el = isl_set_coalesce(all_el);
+
+	all = isl_set_apply(isl_set_copy(all_el), isl_map_copy(el2tile));
+
+	partial = isl_set_copy(all);
+	partial = isl_set_apply(partial, tile2el);
+	partial = isl_set_subtract(partial, all_el);
+	partial = isl_set_apply(partial, el2tile);
+
+	return isl_set_subtract(all, partial);
+}
+
+/* Copy the AST loop types of the non-isolated part to those
+ * of the isolated part.
+ */
+static __isl_give isl_schedule_node *set_isolate_loop_type(
+	__isl_take isl_schedule_node *node)
+{
+	int i, n;
+
+	n = isl_schedule_node_band_n_member(node);
+	for (i = 0; i < n; ++i) {
+		enum isl_ast_loop_type type;
+
+		type = isl_schedule_node_band_member_get_ast_loop_type(node, i);
+		node = isl_schedule_node_band_member_set_isolate_ast_loop_type(
+			node, i, type);
+	}
+
+	return node;
+}
+
+/* If options->isolate_full_tiles is set, then mark the full tiles
+ * in "node" for isolation. The full tiles are derived from "phase".
+ * "node" may point to a part of the tiling, e.g., the space tiling.
+ *
+ * The full tiles are originally computed in the form
+ *
+ *	[[outer] -> [tile]]
+ *
+ * However, the band that "node" points to may only contain
+ * a subset of the tile dimensions.
+ * The description above is therefore treated as
+ *
+ *	[[outer] -> [before; this; after]]
+ *
+ * before is of size "pos"; this is of size "dim"; and
+ * after is of size "out - pos - dim".
+ * The after part is first projected out. Then the range is split
+ * into a before and a this part and finally the before part is moved
+ * to the domain, resulting in
+ *
+ *	[[outer; before] -> [this]]
+ *
+ * This description is then used as the isolate option.
+ *
+ * The AST loop type for the isolated part is set to be the same
+ * as that of the non-isolated part.
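+ *
+ * As a purely illustrative example (numbers not taken from this
+ * change), with out = 4, pos = 1 and dim = 2, the after part is
+ * the final tile dimension, and
+ *
+ *	[[outer] -> [t0; t1, t2; t3]]
+ *
+ * becomes
+ *
+ *	[[outer; t0] -> [t1, t2]]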
+ */ +static __isl_give isl_schedule_node *ppcg_ht_phase_isolate_full_tile_node( + __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node, + struct ppcg_options *options) +{ + int in, out, pos, depth, dim; + isl_space *space; + isl_multi_aff *ma1, *ma2; + isl_set *tile; + isl_map *map; + isl_set *set; + isl_union_set *opt; + + if (!options->isolate_full_tiles) + return node; + + depth = isl_schedule_node_get_schedule_depth(node); + dim = isl_schedule_node_band_n_member(node); + + tile = compute_full_tile(phase); + map = isl_set_unwrap(tile); + in = isl_map_dim(map, isl_dim_in); + out = isl_map_dim(map, isl_dim_out); + pos = depth - in; + map = isl_map_project_out(map, isl_dim_out, pos + dim, + out - (pos + dim)); + space = isl_space_range(isl_map_get_space(map)); + ma1 = isl_multi_aff_project_out_map(isl_space_copy(space), + isl_dim_set, pos, dim); + ma2 = isl_multi_aff_project_out_map(space, isl_dim_set, 0, pos); + ma1 = isl_multi_aff_range_product(ma1, ma2); + map = isl_map_apply_range(map, isl_map_from_multi_aff(ma1)); + map = isl_map_uncurry(map); + map = isl_map_flatten_domain(map); + set = isl_map_wrap(map); + set = isl_set_set_tuple_name(set, "isolate"); + + opt = isl_schedule_node_band_get_ast_build_options(node); + opt = isl_union_set_add_set(opt, set); + node = isl_schedule_node_band_set_ast_build_options(node, opt); + node = set_isolate_loop_type(node); + + return node; +} + +/* Insert a band node for performing the space tiling for "phase" at "node". + * In particular, insert a band node with partial schedule + * + * [P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)] + * + * pulled back over the input schedule. + * "options" determines whether full tiles should be separated + * from partial tiles. + * + * The first tile dimension iterates over the hexagons in the same + * phase, which are independent by construction. The first dimension + * is therefore marked coincident. + * All dimensions are also marked for being generated as atomic loops + * because separation is usually not desirable on tile loops. + */ +static __isl_give isl_schedule_node *insert_space_tiling( + __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node, + struct ppcg_options *options) +{ + isl_multi_aff *space_tile; + isl_multi_union_pw_aff *mupa; + + if (!phase) + return isl_schedule_node_free(node); + + space_tile = isl_multi_aff_copy(phase->space_tile); + mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule); + mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, space_tile); + node = isl_schedule_node_insert_partial_schedule(node, mupa); + node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic); + node = ppcg_ht_phase_isolate_full_tile_node(phase, node, options); + node = isl_schedule_node_band_member_set_coincident(node, 0, 1); + + return node; +} + +/* Given a pointer "node" to (a copy of) the original child node + * in the input pattern, adjust its partial schedule such that + * it starts at zero within each tile. + * + * That is, replace "s" by (s + space_shift) % space_sizes. 
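+ *
+ * For example (illustration only), with space_shift = 3 and
+ * space_sizes = 8, the point s = 13 is mapped to (13 + 3) % 8 = 0,
+ * i.e., the start of its tile.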
+ */ +__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point( + __isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node) +{ + isl_multi_val *space_sizes; + isl_multi_aff *space_shift; + isl_multi_union_pw_aff *mupa; + + space_shift = isl_multi_aff_copy(phase->space_shift); + mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule); + mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, space_shift); + node = isl_schedule_node_band_shift(node, mupa); + space_sizes = isl_multi_val_copy(phase->tiling->space_sizes); + node = isl_schedule_node_band_mod(node, space_sizes); + + return node; +} + +/* Does + * + * s0 > delta + 2 * {delta * h} - 1 + * + * hold? + */ +static isl_bool wide_enough(__isl_keep isl_val *s0, __isl_keep isl_val *delta, + __isl_keep isl_val *h) +{ + isl_val *v, *v2; + isl_bool ok; + + v = isl_val_mul(isl_val_copy(delta), isl_val_copy(h)); + v2 = isl_val_floor(isl_val_copy(v)); + v = isl_val_sub(v, v2); + v = isl_val_mul_ui(v, 2); + v = isl_val_add(v, isl_val_copy(delta)); + v = isl_val_sub_ui(v, 1); + ok = isl_val_gt(s0, v); + isl_val_free(v); + + return ok; +} + +/* Is the tile size specified by "sizes" wide enough in the first space + * dimension, i.e., the base of the hexagon? This ensures that, + * after hybrid tiling using "bounds" and these sizes, + * neighboring hexagons in the same phase are far enough apart + * that they do not depend on each other. + * The test is only meaningful if the bounds are valid. + * + * Let st be (half) the size in the time dimension and s0 the base + * size in the first space dimension. Let delta be the dependence + * distance in either positive or negative direction. In principle, + * it should be enough to have s0 + 1 > delta, i.e., s0 >= delta. + * However, in case of fractional delta, the tile is not extended + * with delta * (st - 1), but instead with floor(delta * (st - 1)). + * The condition therefore needs to be adjusted to + * + * s0 + 1 > delta + 2 {delta * (st - 1)} + * + * (with {} the fractional part) to account for the two slanted sides. + * The condition in the paper "Hybrid Hexagonal/Classical Tiling for GPUs" + * translates to + * + * s0 >= delta + {delta * (st - 1)} + * + * Since 1 > frac(delta * (st - 1)), this condition implies + * the condition above. + * + * The condition is checked for both directions. + */ +isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds, + __isl_keep isl_multi_val *sizes) +{ + isl_val *s0, *h; + isl_val *delta; + isl_bool ok; + + ok = ppcg_ht_bounds_is_valid(bounds); + if (ok < 0 || !ok) + return ok; + + h = isl_val_sub_ui(isl_multi_val_get_val(sizes, 0), 1); + s0 = isl_multi_val_get_val(sizes, 1); + + delta = ppcg_ht_bounds_get_lower(bounds, 0); + ok = wide_enough(s0, delta, h); + isl_val_free(delta); + + delta = ppcg_ht_bounds_get_upper(bounds); + if (ok == isl_bool_true) + ok = wide_enough(s0, delta, h); + isl_val_free(delta); + + isl_val_free(s0); + isl_val_free(h); + + return ok; +} + +/* Check that the tile will be wide enough in the first space + * dimension, i.e., the base of the hexagon. This ensures that + * neighboring hexagons in the same phase are far enough apart + * that they do not depend on each other. + * + * Error out if the condition fails to hold. 
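+ *
+ * As a purely numerical illustration of the width condition,
+ * for delta = 3/2 and st = 4 (so h = st - 1 = 3),
+ * {delta * h} = {9/2} = 1/2 and the test in wide_enough becomes
+ * s0 > 3/2 + 2 * 1/2 - 1 = 3/2, i.e., s0 >= 2.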
+ */
+static isl_stat check_width(__isl_keep ppcg_ht_bounds *bounds,
+	__isl_keep isl_multi_val *sizes)
+{
+	isl_bool ok;
+
+	ok = ppcg_ht_bounds_supports_sizes(bounds, sizes);
+
+	if (ok < 0)
+		return isl_stat_error;
+	if (!ok)
+		isl_die(isl_multi_val_get_ctx(sizes), isl_error_invalid,
+			"base of hybrid tiling hexagon not sufficiently wide",
+			return isl_stat_error);
+
+	return isl_stat_ok;
+}
+
+/* Given valid bounds on the relative dependence distances for
+ * the pair of nested nodes that "node" points to, as well as sufficiently
+ * wide tile sizes "sizes", insert the corresponding time and space tiling
+ * at "node", along with a pair of phase nodes that can be used
+ * to make further changes.
+ * The space of "sizes" should be the product of the spaces
+ * of the schedules of the pair of parent and child nodes.
+ * "options" determines whether full tiles should be separated
+ * from partial tiles.
+ *
+ * In particular, given an input of the form
+ *
+ *	P - C - ...
+ *
+ * the output has the form
+ *
+ *	     /- F0 - M0 - CT0 - P - C - ...
+ *	PT - seq
+ *	     \- F1 - M1 - CT1 - P - C - ...
+ *
+ * PT is the global time tiling. Within each of these tiles,
+ * two phases are executed in order. Within each phase, the schedule
+ * space is further subdivided into tiles through CT0 and CT1.
+ * The first dimension of each of these iterates over the hexagons
+ * within a phase and these are independent by construction.
+ * The F0 and F1 filters filter the statement instances that belong
+ * to the corresponding phase. The M0 and M1 marks contain a pointer
+ * to a ppcg_ht_phase object that can be used to perform further changes.
+ *
+ * After checking that the input satisfies the requirements,
+ * a data structure is constructed that represents the tiling and
+ * two additional data structures are constructed for the two phases
+ * of the tiling. These are then used to define the filters F0 and F1 and
+ * combined to construct the time tiling PT.
+ * Then the time tiling node PT is inserted, followed by
+ * the sequence with the two filters, the CT space tiling nodes and
+ * the phase markers M.
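+ *
+ * A typical (hypothetical) call sequence is
+ *
+ *	bounds = ppcg_ht_compute_bounds(scop, node);
+ *	node = ppcg_ht_bounds_insert_tiling(bounds, sizes, node, options);
+ *
+ * where "sizes" lives in the product of the spaces of the two bands.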
+ */ +__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling( + __isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes, + __isl_take isl_schedule_node *node, struct ppcg_options *options) +{ + isl_ctx *ctx; + isl_union_set *phase0; + isl_union_set *phase1; + isl_multi_union_pw_aff *input, *dom_time; + isl_union_pw_multi_aff *upma; + isl_pw_multi_aff *time; + isl_union_set_list *phases; + ppcg_ht_tiling *tiling; + ppcg_ht_phase *phase_0; + ppcg_ht_phase *phase_1; + + if (!node || !sizes || !bounds) + goto error; + if (check_input_pattern(node) < 0 || check_width(bounds, sizes) < 0) + goto error; + + ctx = isl_schedule_node_get_ctx(node); + + input = extract_input_schedule(node); + + tiling = ppcg_ht_bounds_construct_tiling(bounds, node, input, sizes); + phase_0 = ppcg_ht_tiling_compute_phase(tiling, 1); + phase_1 = ppcg_ht_tiling_compute_phase(tiling, 0); + time = combine_time_tile(phase_0, phase_1); + ppcg_ht_tiling_free(tiling); + + upma = isl_union_pw_multi_aff_from_multi_union_pw_aff( + isl_multi_union_pw_aff_copy(input)); + phase0 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_0)); + phase0 = isl_union_set_preimage_union_pw_multi_aff(phase0, + isl_union_pw_multi_aff_copy(upma)); + phase1 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_1)); + phase1 = isl_union_set_preimage_union_pw_multi_aff(phase1, upma); + + phases = isl_union_set_list_alloc(ctx, 2); + phases = isl_union_set_list_add(phases, phase0); + phases = isl_union_set_list_add(phases, phase1); + + dom_time = isl_multi_union_pw_aff_apply_pw_multi_aff(input, time); + node = isl_schedule_node_insert_partial_schedule(node, dom_time); + + node = isl_schedule_node_child(node, 0); + + node = isl_schedule_node_insert_sequence(node, phases); + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_child(node, 0); + node = insert_space_tiling(phase_0, node, options); + node = insert_phase(node, phase_0); + node = isl_schedule_node_parent(node); + node = isl_schedule_node_next_sibling(node); + node = isl_schedule_node_child(node, 0); + node = insert_space_tiling(phase_1, node, options); + node = insert_phase(node, phase_1); + node = isl_schedule_node_parent(node); + node = isl_schedule_node_parent(node); + + node = isl_schedule_node_parent(node); + + isl_multi_val_free(sizes); + return node; +error: + isl_multi_val_free(sizes); + isl_schedule_node_free(node); + ppcg_ht_bounds_free(bounds); + return NULL; +} + +/* Given a branch "node" that contains a sequence node with two phases + * of hybrid tiling as input, call "fn" on each of the two phase marker + * nodes. + * + * That is, the input is as follows + * + * /- F0 - M0 - ... + * ... - seq + * \- F1 - M1 - ... + * + * and "fn" is called on M0 and on M1. 
+ */ +__isl_give isl_schedule_node *hybrid_tile_foreach_phase( + __isl_take isl_schedule_node *node, + __isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node, + void *user), void *user) +{ + int depth0, depth; + + depth0 = isl_schedule_node_get_tree_depth(node); + + while (node && + isl_schedule_node_get_type(node) != isl_schedule_node_sequence) + node = isl_schedule_node_child(node, 0); + + node = isl_schedule_node_child(node, 0); + node = isl_schedule_node_child(node, 0); + if (!node) + return NULL; + node = fn(node, user); + node = isl_schedule_node_parent(node); + node = isl_schedule_node_next_sibling(node); + node = isl_schedule_node_child(node, 0); + if (!node) + return NULL; + node = fn(node, user); + node = isl_schedule_node_parent(node); + node = isl_schedule_node_parent(node); + + depth = isl_schedule_node_get_tree_depth(node); + node = isl_schedule_node_ancestor(node, depth - depth0); + + return node; +} + +/* This function is called on each of the two phase marks + * in a hybrid tiling tree. + * Drop the phase mark at "node". + */ +static __isl_give isl_schedule_node *drop_phase_mark( + __isl_take isl_schedule_node *node, void *user) +{ + isl_id *id; + isl_bool is_phase; + + if (isl_schedule_node_get_type(node) != isl_schedule_node_mark) + return node; + + id = isl_schedule_node_mark_get_id(node); + is_phase = is_phase_id(id); + isl_id_free(id); + + if (is_phase < 0) + return isl_schedule_node_free(node); + if (is_phase) + node = isl_schedule_node_delete(node); + + return node; +} + +/* Given a branch "node" that contains a sequence node with two phases + * of hybrid tiling as input, remove the two phase marker nodes. + * + * That is, the input is as follows + * + * /- F0 - M0 - ... + * ... - seq + * \- F1 - M1 - ... + * + * and the output is + * + * /- F0 - ... + * ... - seq + * \- F1 - ... 
+ */ +__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks( + __isl_take isl_schedule_node *node) +{ + return hybrid_tile_foreach_phase(node, &drop_phase_mark, NULL); +} Index: polly/trunk/lib/External/ppcg/opencl.c =================================================================== --- polly/trunk/lib/External/ppcg/opencl.c +++ polly/trunk/lib/External/ppcg/opencl.c @@ -216,8 +216,6 @@ p = isl_printer_print_str(p, macros); p = isl_printer_end_line(p); - p = isl_ast_op_type_print_macro(isl_ast_op_max, p); - return p; } @@ -264,6 +262,11 @@ { int need_lower_bound; + need_lower_bound = !is_array_positive_size_guard_trivial(array); + if (need_lower_bound) + p = ppcg_print_macro(isl_ast_op_max, p); + + p = ppcg_ast_expr_print_macros(array->bound_expr, p); p = ppcg_start_block(p); p = isl_printer_start_line(p); @@ -272,9 +275,9 @@ p = isl_printer_print_str(p, " = clCreateBuffer(context, "); p = isl_printer_print_str(p, "CL_MEM_READ_WRITE, "); - need_lower_bound = !is_array_positive_size_guard_trivial(array); if (need_lower_bound) { - p = isl_printer_print_str(p, "max(sizeof("); + p = isl_printer_print_str(p, ppcg_max); + p = isl_printer_print_str(p, "(sizeof("); p = isl_printer_print_str(p, array->type); p = isl_printer_print_str(p, "), "); } @@ -313,6 +316,124 @@ return p; } +/* Free the device array corresponding to "array" + */ +static __isl_give isl_printer *release_device_array(__isl_take isl_printer *p, + struct gpu_array_info *array) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "openclCheckReturn(" + "clReleaseMemObject(dev_"); + p = isl_printer_print_str(p, array->name); + p = isl_printer_print_str(p, "));"); + p = isl_printer_end_line(p); + + return p; +} + +/* Free the accessed device arrays. + */ +static __isl_give isl_printer *opencl_release_device_arrays( + __isl_take isl_printer *p, struct gpu_prog *prog) +{ + int i; + + for (i = 0; i < prog->n_array; ++i) { + struct gpu_array_info *array = &prog->array[i]; + if (!gpu_array_requires_device_allocation(array)) + continue; + + p = release_device_array(p, array); + } + return p; +} + +/* Create an OpenCL device, context, command queue and build the kernel. + * input is the name of the input file provided to ppcg. 
+ */ +static __isl_give isl_printer *opencl_setup(__isl_take isl_printer *p, + const char *input, struct opencl_info *info) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cl_device_id device;"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cl_context context;"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cl_program program;"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cl_command_queue queue;"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cl_int err;"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "device = opencl_create_device("); + p = isl_printer_print_int(p, info->options->opencl_use_gpu); + p = isl_printer_print_str(p, ");"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "context = clCreateContext(NULL, 1, " + "&device, NULL, NULL, &err);"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "openclCheckReturn(err);"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "queue = clCreateCommandQueue" + "(context, device, 0, &err);"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "openclCheckReturn(err);"); + p = isl_printer_end_line(p); + + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "program = "); + + if (info->options->opencl_embed_kernel_code) { + p = isl_printer_print_str(p, "opencl_build_program_from_string(" + "context, device, kernel_code, " + "sizeof(kernel_code), \""); + } else { + p = isl_printer_print_str(p, "opencl_build_program_from_file(" + "context, device, \""); + p = isl_printer_print_str(p, info->kernel_c_name); + p = isl_printer_print_str(p, "\", \""); + } + + if (info->options->opencl_compiler_options) + p = isl_printer_print_str(p, + info->options->opencl_compiler_options); + + p = isl_printer_print_str(p, "\");"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_end_line(p); + + return p; +} + +static __isl_give isl_printer *opencl_release_cl_objects( + __isl_take isl_printer *p, struct opencl_info *info) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "openclCheckReturn(clReleaseCommandQueue" + "(queue));"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "openclCheckReturn(clReleaseProgram" + "(program));"); + p = isl_printer_end_line(p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "openclCheckReturn(clReleaseContext" + "(context));"); + p = isl_printer_end_line(p); + + return p; +} + /* Print a call to the OpenCL clSetKernelArg() function which sets * the arguments of the kernel. arg_name and arg_index are the name and the * index of the kernel argument. The index of the leftmost argument of @@ -761,6 +882,26 @@ return p; } +/* Macro definitions for ppcg_min and ppcg_max for use + * in OpenCL kernel code. + * These macro definitions essentially call the corresponding + * OpenCL macros/functions, but first ensure that the two arguments + * have the same type, since the OpenCL versions are only defined + * in case those arguments have the same type. 
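+ *
+ * For example (illustration only), a call ppcg_min(n, 64U) with "n"
+ * of type int would otherwise expand to min() on an int and an
+ * unsigned int; the __typeof__(x + y) casts first convert both
+ * arguments to their common type.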
+ */ +static const char *opencl_min = + "(x,y) min((__typeof__(x + y)) x, (__typeof__(x + y)) y)"; +static const char *opencl_max = + "(x,y) max((__typeof__(x + y)) x, (__typeof__(x + y)) y)"; + +/* Set the macro definitions for ppcg_min and ppcg_max to + * OpenCL specific versions. + */ +static __isl_give isl_printer *set_opencl_macros(__isl_take isl_printer *p) +{ + return ppcg_set_macros(p, opencl_min, opencl_max); +} + static __isl_give isl_printer *opencl_print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel, __isl_take isl_printer *p) { @@ -779,8 +920,9 @@ p = opencl_print_kernel_iterators(p, kernel); p = opencl_print_kernel_vars(p, kernel); p = isl_printer_end_line(p); - p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p); - p = ppcg_print_macros(p, kernel->tree); + p = ppcg_set_macro_names(p); + p = set_opencl_macros(p); + p = gpu_print_macros(p, kernel->tree); p = isl_ast_node_print(kernel->tree, p, print_options); p = isl_printer_indent(p, -4); p = isl_printer_start_line(p); @@ -813,24 +955,27 @@ __isl_take isl_printer *p, struct ppcg_kernel *kernel, int i) { int grid_dim, block_dim; - isl_pw_aff *bound_grid; + isl_ast_expr *grid_size_expr; + isl_ast_expr *bound_grid; grid_dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set); block_dim = kernel->n_block; if (i < min(grid_dim, block_dim)) { - bound_grid = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i); + grid_size_expr = kernel->grid_size_expr; + bound_grid = isl_ast_expr_get_op_arg(grid_size_expr, 1 + i); p = isl_printer_print_str(p, "("); - p = isl_printer_print_pw_aff(p, bound_grid); + p = isl_printer_print_ast_expr(p, bound_grid); p = isl_printer_print_str(p, ") * "); p = isl_printer_print_int(p, kernel->block_dim[i]); - isl_pw_aff_free(bound_grid); - } else if (i >= grid_dim) + isl_ast_expr_free(bound_grid); + } else if (i >= grid_dim) { p = isl_printer_print_int(p, kernel->block_dim[i]); - else { - bound_grid = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i); - p = isl_printer_print_pw_aff(p, bound_grid); - isl_pw_aff_free(bound_grid); + } else { + grid_size_expr = kernel->grid_size_expr; + bound_grid = isl_ast_expr_get_op_arg(grid_size_expr, 1 + i); + p = isl_printer_print_ast_expr(p, bound_grid); + isl_ast_expr_free(bound_grid); } return p; @@ -907,16 +1052,50 @@ return p; } -/* Print a statement for copying an array to or from the device. - * The statement identifier is called "to_device_" or - * "from_device_" and its user pointer points - * to the gpu_array_info of the array that needs to be copied. - * - * Extract the array from the identifier and call - * copy_array_to_device or copy_array_from_device. +/* Print code for initializing the device for execution of the transformed + * code. This includes declaring locally defined variables as well as + * declaring and allocating the required copies of arrays on the device. + */ +static __isl_give isl_printer *init_device(__isl_take isl_printer *p, + struct gpu_prog *prog, struct opencl_info *opencl) +{ + p = opencl_print_host_macros(p); + + p = gpu_print_local_declarations(p, prog); + p = opencl_declare_device_arrays(p, prog); + p = opencl_setup(p, opencl->input, opencl); + p = opencl_allocate_device_arrays(p, prog); + + return p; +} + +/* Print code for clearing the device after execution of the transformed code. + * In particular, free the memory that was allocated on the device. 
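+ *
+ * For a (hypothetical) array A, the generated code is of the form
+ *
+ *	openclCheckReturn(clReleaseMemObject(dev_A));
+ *
+ * followed by the release of the command queue, program and context.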
 */
-static __isl_give isl_printer *print_to_from_device(__isl_take isl_printer *p,
-	__isl_keep isl_ast_node *node, struct gpu_prog *prog)
+static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
+	struct gpu_prog *prog, struct opencl_info *opencl)
+{
+	p = opencl_release_device_arrays(p, prog);
+	p = opencl_release_cl_objects(p, opencl);
+
+	return p;
+}
+
+/* Print a statement for copying an array to or from the device,
+ * or for initializing or clearing the device.
+ * The statement identifier of a copying node is called
+ * "to_device_<array name>" or "from_device_<array name>" and
+ * its user pointer points to the gpu_array_info of the array
+ * that needs to be copied.
+ * The node for initializing the device is called "init_device".
+ * The node for clearing the device is called "clear_device".
+ *
+ * Extract the array (if any) from the identifier and call
+ * init_device, clear_device, copy_array_to_device or copy_array_from_device.
+ */
+static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
+	__isl_keep isl_ast_node *node, struct gpu_prog *prog,
+	struct opencl_info *opencl)
 {
 	isl_ast_expr *expr, *arg;
 	isl_id *id;
@@ -933,7 +1112,11 @@
 	isl_ast_expr_free(expr);
 
 	if (!name)
-		array = NULL;
+		return isl_printer_free(p);
+	if (!strcmp(name, "init_device"))
+		return init_device(p, prog, opencl);
+	if (!strcmp(name, "clear_device"))
+		return clear_device(p, prog, opencl);
 	if (!array)
 		return isl_printer_free(p);
 
@@ -945,11 +1128,12 @@
 
 /* Print the user statement of the host code to "p".
  *
- * The host code may contain original user statements, kernel launches and
- * statements that copy data to/from the device.
+ * The host code may contain original user statements, kernel launches,
+ * statements that copy data to/from the device and statements
+ * that initialize or clear the device.
  * The original user statements and the kernel launches have
- * an associated annotation, while the data copy statements do not.
- * The latter are handled by print_to_from_device.
+ * an associated annotation, while the other statements do not.
+ * The latter are handled by print_device_node.
 * The annotation on the user statements is called "user".
 *
 * In case of a kernel launch, print a block of statements that
@@ -989,7 +1173,7 @@
 
 	id = isl_ast_node_get_annotation(node);
 	if (!id)
-		return print_to_from_device(p, node, data->prog);
+		return print_device_node(p, node, data->prog, data->opencl);
 
 	is_user = !strcmp(isl_id_get_name(id), "user");
 	kernel = is_user ? NULL : isl_id_get_user(id);
@@ -1092,130 +1276,12 @@
 	print_options = isl_ast_print_options_set_print_user(print_options,
 		&opencl_print_host_user, &data);
 
-	p = ppcg_print_macros(p, tree);
+	p = gpu_print_macros(p, tree);
 	p = isl_ast_node_print(tree, p, print_options);
 
 	return p;
 }
 
-/* Create an OpenCL device, context, command queue and build the kernel.
- * input is the name of the input file provided to ppcg.
- */ -static __isl_give isl_printer *opencl_setup(__isl_take isl_printer *p, - const char *input, struct opencl_info *info) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cl_device_id device;"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cl_context context;"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cl_program program;"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cl_command_queue queue;"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "cl_int err;"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "device = opencl_create_device("); - p = isl_printer_print_int(p, info->options->opencl_use_gpu); - p = isl_printer_print_str(p, ");"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "context = clCreateContext(NULL, 1, " - "&device, NULL, NULL, &err);"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "openclCheckReturn(err);"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "queue = clCreateCommandQueue" - "(context, device, 0, &err);"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "openclCheckReturn(err);"); - p = isl_printer_end_line(p); - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "program = "); - - if (info->options->opencl_embed_kernel_code) { - p = isl_printer_print_str(p, "opencl_build_program_from_string(" - "context, device, kernel_code, " - "sizeof(kernel_code), \""); - } else { - p = isl_printer_print_str(p, "opencl_build_program_from_file(" - "context, device, \""); - p = isl_printer_print_str(p, info->kernel_c_name); - p = isl_printer_print_str(p, "\", \""); - } - - if (info->options->opencl_compiler_options) - p = isl_printer_print_str(p, - info->options->opencl_compiler_options); - - p = isl_printer_print_str(p, "\");"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_end_line(p); - - return p; -} - -static __isl_give isl_printer *opencl_release_cl_objects( - __isl_take isl_printer *p, struct opencl_info *info) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "openclCheckReturn(clReleaseCommandQueue" - "(queue));"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "openclCheckReturn(clReleaseProgram" - "(program));"); - p = isl_printer_end_line(p); - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "openclCheckReturn(clReleaseContext" - "(context));"); - p = isl_printer_end_line(p); - - return p; -} - -/* Free the device array corresponding to "array" - */ -static __isl_give isl_printer *release_device_array(__isl_take isl_printer *p, - struct gpu_array_info *array) -{ - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, "openclCheckReturn(" - "clReleaseMemObject(dev_"); - p = isl_printer_print_str(p, array->name); - p = isl_printer_print_str(p, "));"); - p = isl_printer_end_line(p); - - return p; -} - -/* Free the accessed device arrays. 
- */ -static __isl_give isl_printer *opencl_release_device_arrays( - __isl_take isl_printer *p, struct gpu_prog *prog) -{ - int i; - - for (i = 0; i < prog->n_array; ++i) { - struct gpu_array_info *array = &prog->array[i]; - if (!gpu_array_requires_device_allocation(array)) - continue; - - p = release_device_array(p, array); - } - return p; -} - /* Given a gpu_prog "prog" and the corresponding transformed AST * "tree", print the entire OpenCL code to "p". */ @@ -1237,22 +1303,8 @@ if (!opencl->kprinter) return isl_printer_free(p); - p = ppcg_start_block(p); - - p = opencl_print_host_macros(p); - - p = gpu_print_local_declarations(p, prog); - p = opencl_declare_device_arrays(p, prog); - p = opencl_setup(p, opencl->input, opencl); - p = opencl_allocate_device_arrays(p, prog); - p = opencl_print_host_code(p, prog, tree, opencl); - p = opencl_release_device_arrays(p, prog); - p = opencl_release_cl_objects(p, opencl); - - p = ppcg_end_block(p); - return p; } Index: polly/trunk/lib/External/ppcg/opencl_test.sh.in =================================================================== --- polly/trunk/lib/External/ppcg/opencl_test.sh.in +++ polly/trunk/lib/External/ppcg/opencl_test.sh.in @@ -54,6 +54,25 @@ run_tests default run_tests embed --opencl-embed-kernel-code +for i in $srcdir/examples/*.c; do + echo $i + name=`basename $i` + name="${name%.c}" + exe_ref="${OUTDIR}/$name.ref$EXEEXT" + gen_ocl="${OUTDIR}/$name.ppcg.c" + exe_ocl="${OUTDIR}/$name.ppcg$EXEEXT" + output_ref="${OUTDIR}/$name.ref.out" + output_ocl="${OUTDIR}/$name.ppcg.out" + $CC $CFLAGS $i -o $exe_ref || exit + ./ppcg$EXEEXT --target=opencl --opencl-no-use-gpu $i -o "$gen_ocl" || \ + exit + $CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \ + "$gen_ocl" -o "$exe_ocl" || exit + $exe_ref > $output_ref || exit + $exe_ocl > $output_ocl || exit + cmp $output_ref $output_ocl || exit +done + if [ $keep = "no" ]; then rm -r "${OUTDIR}" fi Index: polly/trunk/lib/External/ppcg/polybench_test.sh.in =================================================================== --- polly/trunk/lib/External/ppcg/polybench_test.sh.in +++ polly/trunk/lib/External/ppcg/polybench_test.sh.in @@ -88,8 +88,8 @@ done } -run_tests ppcg --target=c -run_tests ppcg_live "--target=c --no-live-range-reordering" +run_tests ppcg "--target=c --tile" +run_tests ppcg_live "--target=c --no-live-range-reordering --tile" # Test OpenMP code, if compiler supports openmp if [ $HAVE_OPENMP = "yes" ]; then Index: polly/trunk/lib/External/ppcg/ppcg.h =================================================================== --- polly/trunk/lib/External/ppcg/ppcg.h +++ polly/trunk/lib/External/ppcg/ppcg.h @@ -37,8 +37,10 @@ * to a reference identifier * "live_out" contains the potential write accesses that are potentially * not killed by any kills or any other writes. - * "tagged_must_kills" contains all definite kill accesses with - * a reference identifier in the domain. + * "must_kills" contains all definite kill accesses. + * "tagged_must_kills" is the same as "must_kills", except that the domain + * is a wrapped relation mapping an iteration domain + * to a reference identifier. * * "tagger" maps tagged iteration domains to the corresponding untagged * iteration domain. 
@@ -87,6 +89,7 @@ isl_union_map *must_writes; isl_union_map *live_out; isl_union_map *tagged_must_kills; + isl_union_map *must_kills; isl_union_pw_multi_aff *tagger; @@ -114,8 +117,8 @@ __isl_give isl_printer *(*fn)(__isl_take isl_printer *p, struct ppcg_scop *scop, void *user), void *user); -void compute_tagger(struct ppcg_scop *ps); -void compute_dependences(struct ppcg_scop *scop); -void *ppcg_scop_free(struct ppcg_scop *ps); +__isl_give isl_schedule *ppcg_compute_schedule( + __isl_take isl_schedule_constraints *sc, + __isl_keep isl_schedule *schedule, struct ppcg_options *options); #endif Index: polly/trunk/lib/External/ppcg/ppcg.c =================================================================== --- polly/trunk/lib/External/ppcg/ppcg.c +++ polly/trunk/lib/External/ppcg/ppcg.c @@ -16,9 +16,17 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include #include +#include +#include #include #include #include @@ -29,7 +37,6 @@ #include "cpu.h" struct options { - struct isl_options *isl; struct pet_options *pet; struct ppcg_options *ppcg; char *input; @@ -43,7 +50,6 @@ } ISL_ARGS_START(struct options, options_args) -ISL_ARG_CHILD(struct options, isl, "isl", &isl_options_args, "isl options") ISL_ARG_CHILD(struct options, pet, "pet", &pet_options_args, "pet options") ISL_ARG_CHILD(struct options, ppcg, NULL, &ppcg_options_args, "ppcg options") ISL_ARG_STR(struct options, output, 'o', NULL, @@ -99,8 +105,6 @@ if (!scop) return 0; - // This is a pet feature not available in Polly. - return 0; for (i = 0; i < scop->pet->n_array; ++i) if (scop->pet->arrays[i]->declared && !scop->pet->arrays[i]->exposed) @@ -186,7 +190,6 @@ int n, const char *prefix) { int i; - char name[10]; isl_ctx *ctx; isl_id_list *names; @@ -338,7 +341,7 @@ * * { [S[i,j] -> R_1[]] -> S[i,j]; [S[i,j] -> R_2[]] -> S[i,j] } */ -void compute_tagger(struct ppcg_scop *ps) +static void compute_tagger(struct ppcg_scop *ps) { isl_union_map *tagged; isl_union_pw_multi_aff *tagger; @@ -362,12 +365,25 @@ * * We compute the "dependence" of any "kill" (an explicit kill * or a must write) on any may write. - * The may writes with a "depending" kill are definitely killed. + * The elements accessed by the may writes with a "depending" kill + * also accessing the element are definitely killed. * The remaining may writes can potentially be live out. + * + * The result of the dependence analysis is + * + * { IW -> [IK -> A] } + * + * with IW the instance of the write statement, IK the instance of kill + * statement and A the element that was killed. + * The range factor range is + * + * { IW -> A } + * + * containing all such pairs for which there is a kill statement instance, + * i.e., all pairs that have been killed. 
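+ *
+ * For example (illustration only), if a may write instance W[i]
+ * writes A[i] and a kill instance K[i] definitely kills A[i],
+ * then the result contains { W[i] -> [K[i] -> A[i]] } and its
+ * range factor range contributes the killed pair { W[i] -> A[i] }.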
*/ static void compute_live_out(struct ppcg_scop *ps) { - isl_union_pw_multi_aff *tagger; isl_schedule *schedule; isl_union_map *kills; isl_union_map *exposed; @@ -375,22 +391,21 @@ isl_union_access_info *access; isl_union_flow *flow; - tagger = isl_union_pw_multi_aff_copy(ps->tagger); schedule = isl_schedule_copy(ps->schedule); - schedule = isl_schedule_pullback_union_pw_multi_aff(schedule, tagger); - kills = isl_union_map_union(isl_union_map_copy(ps->tagged_must_writes), - isl_union_map_copy(ps->tagged_must_kills)); + kills = isl_union_map_union(isl_union_map_copy(ps->must_writes), + isl_union_map_copy(ps->must_kills)); access = isl_union_access_info_from_sink(kills); access = isl_union_access_info_set_may_source(access, - isl_union_map_copy(ps->tagged_may_writes)); + isl_union_map_copy(ps->may_writes)); access = isl_union_access_info_set_schedule(access, schedule); flow = isl_union_access_info_compute_flow(access); - covering = isl_union_flow_get_may_dependence(flow); + covering = isl_union_flow_get_full_may_dependence(flow); isl_union_flow_free(flow); - exposed = isl_union_map_copy(ps->tagged_may_writes); - exposed = isl_union_map_subtract_domain(exposed, - isl_union_map_domain(covering)); - ps->live_out = project_out_tags(exposed); + + covering = isl_union_map_range_factor_range(covering); + exposed = isl_union_map_copy(ps->may_writes); + exposed = isl_union_map_subtract(exposed, covering); + ps->live_out = exposed; } /* Compute the tagged flow dependences and the live_in accesses and store @@ -696,7 +711,7 @@ * set of order dependences and a set of external false dependences * in compute_live_range_reordering_dependences. */ -void compute_dependences(struct ppcg_scop *scop) +static void compute_dependences(struct ppcg_scop *scop) { isl_union_map *may_source; isl_union_access_info *access; @@ -815,7 +830,7 @@ return set; } -void *ppcg_scop_free(struct ppcg_scop *ps) +static void *ppcg_scop_free(struct ppcg_scop *ps) { if (!ps) return NULL; @@ -832,6 +847,7 @@ isl_union_map_free(ps->must_writes); isl_union_map_free(ps->live_out); isl_union_map_free(ps->tagged_must_kills); + isl_union_map_free(ps->must_kills); isl_union_map_free(ps->tagged_dep_flow); isl_union_map_free(ps->dep_flow); isl_union_map_free(ps->dep_false); @@ -882,13 +898,14 @@ } ps->domain = collect_non_kill_domains(scop); ps->call = collect_call_domains(scop); - ps->tagged_reads = pet_scop_collect_tagged_may_reads(scop); - ps->reads = pet_scop_collect_may_reads(scop); - ps->tagged_may_writes = pet_scop_collect_tagged_may_writes(scop); - ps->may_writes = pet_scop_collect_may_writes(scop); - ps->tagged_must_writes = pet_scop_collect_tagged_must_writes(scop); - ps->must_writes = pet_scop_collect_must_writes(scop); - ps->tagged_must_kills = pet_scop_collect_tagged_must_kills(scop); + ps->tagged_reads = pet_scop_get_tagged_may_reads(scop); + ps->reads = pet_scop_get_may_reads(scop); + ps->tagged_may_writes = pet_scop_get_tagged_may_writes(scop); + ps->may_writes = pet_scop_get_may_writes(scop); + ps->tagged_must_writes = pet_scop_get_tagged_must_writes(scop); + ps->must_writes = pet_scop_get_must_writes(scop); + ps->tagged_must_kills = pet_scop_get_tagged_must_kills(scop); + ps->must_kills = pet_scop_get_must_kills(scop); ps->schedule = isl_schedule_copy(scop->schedule); ps->pet = scop; ps->independence = isl_union_map_empty(isl_set_get_space(ps->context)); @@ -902,7 +919,7 @@ if (!ps->context || !ps->domain || !ps->call || !ps->reads || !ps->may_writes || !ps->must_writes || !ps->tagged_must_kills || - !ps->schedule || 
!ps->independence || !ps->names) + !ps->must_kills || !ps->schedule || !ps->independence || !ps->names) return ppcg_scop_free(ps); return ps; @@ -1009,7 +1026,6 @@ return 0; } -#if 0 int main(int argc, char **argv) { int r; @@ -1020,8 +1036,12 @@ assert(options); ctx = isl_ctx_alloc_with_options(&options_args, options); - isl_options_set_schedule_outer_coincidence(ctx, 1); + ppcg_options_set_target_defaults(options->ppcg); + isl_options_set_ast_build_detect_min_max(ctx, 1); + isl_options_set_ast_print_macro_once(ctx, 1); + isl_options_set_schedule_whole_component(ctx, 0); isl_options_set_schedule_maximize_band_depth(ctx, 1); + isl_options_set_schedule_maximize_coincidence(ctx, 1); pet_options_set_encapsulate_dynamic_control(ctx, 1); argc = options_parse(options, argc, argv, ISL_ARG_ALL); @@ -1040,4 +1060,3 @@ return r; } -#endif Index: polly/trunk/lib/External/ppcg/ppcg_options.h =================================================================== --- polly/trunk/lib/External/ppcg/ppcg_options.h +++ polly/trunk/lib/External/ppcg/ppcg_options.h @@ -2,6 +2,7 @@ #define PPCG_OPTIONS_H #include +#include struct ppcg_debug_options { int dump_schedule_constraints; @@ -12,8 +13,12 @@ }; struct ppcg_options { + struct isl_options *isl; struct ppcg_debug_options *debug; + /* Group chains of consecutive statements before scheduling. */ + int group_chains; + /* Use isl to compute a schedule replacing the original schedule. */ int reschedule; int scale_tile_loops; @@ -24,8 +29,13 @@ char *ctx; char *sizes; + /* Perform tiling (C target). */ + int tile; int tile_size; + /* Isolate full tiles from partial tiles. */ + int isolate_full_tiles; + /* Take advantage of private memory. */ int use_private_memory; @@ -44,9 +54,20 @@ /* Linearize all device arrays. */ int linearize_device_arrays; + /* Allow the use of GNU extensions in generated code. */ + int allow_gnu_extensions; + /* Allow live range to be reordered. */ int live_range_reordering; + /* Allow hybrid tiling whenever a suitable input pattern is found. */ + int hybrid; + + /* Unroll the code for copying to/from shared memory. */ + int unroll_copy_shared; + /* Unroll code inside tile on GPU targets. */ + int unroll_gpu_tile; + /* Options to pass to the OpenCL compiler. */ char *opencl_compiler_options; /* Prefer GPU device over CPU. */ @@ -74,4 +95,6 @@ #define PPCG_TARGET_CUDA 1 #define PPCG_TARGET_OPENCL 2 +void ppcg_options_set_target_defaults(struct ppcg_options *options); + #endif Index: polly/trunk/lib/External/ppcg/ppcg_options.c =================================================================== --- polly/trunk/lib/External/ppcg/ppcg_options.c +++ polly/trunk/lib/External/ppcg/ppcg_options.c @@ -17,6 +17,36 @@ {0} }; +/* Set defaults that depend on the target. + * In particular, set --schedule-outer-coincidence iff target is a GPU. + */ +void ppcg_options_set_target_defaults(struct ppcg_options *options) +{ + char *argv[2] = { NULL }; + + argv[0] = "ppcg_options_set_target_defaults"; + if (options->target == PPCG_TARGET_C) + argv[1] = "--no-schedule-outer-coincidence"; + else + argv[1] = "--schedule-outer-coincidence"; + + isl_options_parse(options->isl, 2, argv, ISL_ARG_ALL); +} + +/* Callback that is called whenever the "target" option is set (to "val"). + * The callback is called after target has been updated. + * + * Call ppcg_options_set_target_defaults to reset the target-dependent options. 
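+ *
+ * For example, a command line containing "--target=c" first updates
+ * options->target to PPCG_TARGET_C and then invokes this callback,
+ * which in turn selects --no-schedule-outer-coincidence (a sketch of
+ * the intended flow; the actual dispatch is performed by the isl
+ * option machinery).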
+ */ +static int set_target(void *opt, unsigned val) +{ + struct ppcg_options *options = opt; + + ppcg_options_set_target_defaults(options); + + return 0; +} + ISL_ARGS_START(struct ppcg_debug_options, ppcg_debug_options_args) ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule_constraints, 0, "dump-schedule-constraints", 0, "dump schedule constraints") @@ -46,10 +76,14 @@ ISL_ARGS_END ISL_ARGS_START(struct ppcg_options, ppcg_options_args) +ISL_ARG_CHILD(struct ppcg_options, isl, "isl", &isl_options_args, "isl options") ISL_ARG_CHILD(struct ppcg_options, debug, NULL, &ppcg_debug_options_args, "debugging options") +ISL_ARG_BOOL(struct ppcg_options, group_chains, 0, "group-chains", 1, + "group chains of interdependent statements that are executed " + "consecutively in the original schedule before scheduling") ISL_ARG_BOOL(struct ppcg_options, reschedule, 0, "reschedule", 1, - "replace original schedule by isl computed schedule (except C target)") + "replace original schedule by isl computed schedule") ISL_ARG_BOOL(struct ppcg_options, scale_tile_loops, 0, "scale-tile-loops", 1, NULL) ISL_ARG_BOOL(struct ppcg_options, wrap, 0, "wrap", 1, NULL) @@ -62,22 +96,37 @@ ISL_ARG_BOOL(struct ppcg_options, non_negative_parameters, 0, "assume-non-negative-parameters", 0, "assume all parameters are non-negative)") +ISL_ARG_BOOL(struct ppcg_options, tile, 0, "tile", 0, + "perform tiling (C target)") ISL_ARG_INT(struct ppcg_options, tile_size, 'S', "tile-size", "size", 32, NULL) +ISL_ARG_BOOL(struct ppcg_options, isolate_full_tiles, 0, "isolate-full-tiles", + 0, "isolate full tiles from partial tiles (hybrid tiling)") ISL_ARG_STR(struct ppcg_options, sizes, 0, "sizes", "sizes", NULL, "Per kernel tile, grid and block sizes") ISL_ARG_INT(struct ppcg_options, max_shared_memory, 0, "max-shared-memory", "size", 8192, "maximal amount of shared memory") ISL_ARG_BOOL(struct ppcg_options, openmp, 0, "openmp", 0, "Generate OpenMP macros (only for C target)") -ISL_ARG_CHOICE(struct ppcg_options, target, 0, "target", target, - PPCG_TARGET_CUDA, "the target to generate code for") +ISL_ARG_USER_OPT_CHOICE(struct ppcg_options, target, 0, "target", target, + &set_target, PPCG_TARGET_CUDA, PPCG_TARGET_CUDA, + "the target to generate code for") ISL_ARG_BOOL(struct ppcg_options, linearize_device_arrays, 0, "linearize-device-arrays", 1, "linearize all device arrays, even those of fixed size") +ISL_ARG_BOOL(struct ppcg_options, allow_gnu_extensions, 0, + "allow-gnu-extensions", 1, + "allow the use of GNU extensions in generated code") ISL_ARG_BOOL(struct ppcg_options, live_range_reordering, 0, "live-range-reordering", 1, "allow successive live ranges on the same memory element " "to be reordered") +ISL_ARG_BOOL(struct ppcg_options, hybrid, 0, "hybrid", 0, + "apply hybrid tiling whenever a suitable input pattern is found " + "(GPU targets)") +ISL_ARG_BOOL(struct ppcg_options, unroll_copy_shared, 0, "unroll-copy-shared", + 0, "unroll code for copying to/from shared memory") +ISL_ARG_BOOL(struct ppcg_options, unroll_gpu_tile, 0, "unroll-gpu-tile", 0, + "unroll code inside tile on GPU targets") ISL_ARG_GROUP("opencl", &ppcg_opencl_options_args, "OpenCL options") ISL_ARG_STR(struct ppcg_options, save_schedule_file, 0, "save-schedule", "file", NULL, "save isl computed schedule to ") Index: polly/trunk/lib/External/ppcg/print.h =================================================================== --- polly/trunk/lib/External/ppcg/print.h +++ polly/trunk/lib/External/ppcg/print.h @@ -5,12 +5,31 @@ #include "ppcg.h" +extern const char 
*ppcg_min; +extern const char *ppcg_max; +extern const char *ppcg_fdiv_q; + __isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p); __isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p); +__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p); +__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p, + const char *min, const char *max); +__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type, + __isl_take isl_printer *p); +__isl_give isl_printer *ppcg_ast_expr_print_macros( + __isl_keep isl_ast_expr *expr, __isl_take isl_printer *p); +__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p, + __isl_keep isl_id_to_ast_expr *ref2expr); __isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p, __isl_keep isl_ast_node *node); +__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size, + __isl_keep isl_ast_build *build); + +__isl_give isl_printer *ppcg_print_declaration_with_size( + __isl_take isl_printer *p, const char *base_type, + __isl_keep isl_ast_expr *size); __isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p, struct pet_array *array, __isl_keep isl_ast_build *build); __isl_give isl_printer *ppcg_print_exposed_declarations( @@ -18,9 +37,4 @@ __isl_give isl_printer *ppcg_print_hidden_declarations( __isl_take isl_printer *p, struct ppcg_scop *scop); -__isl_give isl_printer *ppcg_print_guarded(__isl_take isl_printer *p, - __isl_take isl_set *guard, __isl_take isl_set *context, - __isl_give isl_printer *(*fn)(__isl_take isl_printer *p, void *user), - void *user); - #endif Index: polly/trunk/lib/External/ppcg/print.c =================================================================== --- polly/trunk/lib/External/ppcg/print.c +++ polly/trunk/lib/External/ppcg/print.c @@ -11,6 +11,7 @@ #include #include "print.h" +#include "util.h" __isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p) { @@ -30,21 +31,276 @@ return p; } -static int print_macro(enum isl_ast_op_type type, void *user) +/* Names of notes that keep track of whether min/max + * macro definitions have already been printed. + */ +static const char *ppcg_max_printed = "ppcg_max_printed"; +static const char *ppcg_min_printed = "ppcg_min_printed"; + +/* Has the macro definition corresponding to "note_name" been printed + * to "p" before? + * That is, does "p" have an associated "note_name" note? + */ +static isl_bool printed_before(__isl_keep isl_printer *p, const char *note_name) +{ + isl_ctx *ctx; + isl_id *id; + isl_bool printed; + + if (!p) + return isl_bool_error; + + ctx = isl_printer_get_ctx(p); + id = isl_id_alloc(ctx, note_name, NULL); + printed = isl_printer_has_note(p, id); + isl_id_free(id); + + return printed; +} + +/* Keep track of the fact that the macro definition corresponding + * to "note_name" has been printed to "p" by attaching a note with + * that name. The value of the note is of no importance, but it + * has to be a valid isl_id, so the note identifier is reused + * as the note. + */ +static __isl_give isl_printer *mark_printed(__isl_take isl_printer *p, + const char *note_name) +{ + isl_ctx *ctx; + isl_id *id; + + if (!p) + return NULL; + + ctx = isl_printer_get_ctx(p); + id = isl_id_alloc(ctx, note_name, NULL); + return isl_printer_set_note(p, id, isl_id_copy(id)); +} + +/* Print a macro definition "def" for the macro "name" to "p", + * unless such a macro definition has been printed to "p" before. 
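+ * (For instance, a second call print_ppcg_macro(p, "ppcg_min", def,
+ * ppcg_min_printed) on the same printer prints nothing; this is how
+ * print_min below avoids printing duplicate definitions.)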
+ * "note_name" is used as the name of the note that keeps track + * of whether this printing has happened. + */ +static __isl_give isl_printer *print_ppcg_macro(__isl_take isl_printer *p, + const char *name, const char *def, const char *note_name) +{ + isl_bool printed; + + printed = printed_before(p, note_name); + if (printed < 0) + return isl_printer_free(p); + if (printed) + return p; + + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "#define "); + p = isl_printer_print_str(p, name); + p = isl_printer_print_str(p, def); + p = isl_printer_end_line(p); + + p = mark_printed(p, note_name); + + return p; +} + +/* Structure for keeping track of definitions of some macros. + */ +struct ppcg_macros { + const char *min; + const char *max; +}; + +/* Free the memory allocated by a struct ppcg_macros. + */ +static void ppcg_macros_free(void *user) +{ + free(user); +} + +/* Default macro definitions (when GNU extensions are allowed). + */ +struct ppcg_macros ppcg_macros_default = { + .min = "(x,y) " + "({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); " + "_x < _y ? _x : _y; })", + .max = "(x,y) " + "({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); " + "_x > _y ? _x : _y; })", +}; + +/* Name used for the note that keeps track of macro definitions. + */ +static const char *ppcg_macros = "ppcg_macros"; + +/* Set the macro definitions for isl_ast_op_min and isl_ast_op_max + * to "min" and "max" and store them in "p". + * + * In particular, create a ppcg_macros object and attach it + * as a note to the printer. + */ +__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p, + const char *min, const char *max) +{ + isl_ctx *ctx; + isl_id *id, *macros_id; + struct ppcg_macros *macros; + + if (!p) + return NULL; + + ctx = isl_printer_get_ctx(p); + macros = isl_alloc_type(ctx, struct ppcg_macros); + if (!macros) + return isl_printer_free(p); + macros->min = min; + macros->max = max; + id = isl_id_alloc(ctx, ppcg_macros, NULL); + macros_id = isl_id_alloc(ctx, NULL, macros); + if (!macros_id) + ppcg_macros_free(macros); + else + macros_id = isl_id_set_free_user(macros_id, &ppcg_macros_free); + + p = isl_printer_set_note(p, id, macros_id); + + return p; +} + +/* Return the ppcg_macros object that holds the currently active + * macro definitions in "p". + * If "p" has a note with macro definitions, then return those. + * Otherwise, return the default macro definitions. + */ +static struct ppcg_macros *get_macros(__isl_keep isl_printer *p) +{ + isl_id *id; + isl_bool has_macros; + struct ppcg_macros *macros; + + id = isl_id_alloc(isl_printer_get_ctx(p), ppcg_macros, NULL); + has_macros = isl_printer_has_note(p, id); + if (has_macros < 0 || !has_macros) { + isl_id_free(id); + if (has_macros < 0) + return NULL; + return &ppcg_macros_default; + } + id = isl_printer_get_note(p, id); + macros = isl_id_get_user(id); + isl_id_free(id); + + return macros; +} + +/* Print the currently active macro definition for ppcg_max. + */ +static __isl_give isl_printer *print_max(__isl_take isl_printer *p) +{ + struct ppcg_macros *macros; + + macros = get_macros(p); + if (!macros) + return isl_printer_free(p); + return print_ppcg_macro(p, ppcg_max, macros->max, ppcg_max_printed); +} + +/* Print the currently active macro definition for ppcg_min. 
+ */ +static __isl_give isl_printer *print_min(__isl_take isl_printer *p) +{ + struct ppcg_macros *macros; + + macros = get_macros(p); + if (!macros) + return isl_printer_free(p); + return print_ppcg_macro(p, ppcg_min, macros->min, ppcg_min_printed); +} + +/* Print a macro definition for "type" to "p". + * If GNU extensions are allowed, then print a specialized definition + * for isl_ast_op_min and isl_ast_op_max. + * Otherwise, use the default isl definition. + */ +__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type, + __isl_take isl_printer *p) +{ + isl_ctx *ctx; + struct ppcg_options *options; + + if (!p) + return NULL; + + ctx = isl_printer_get_ctx(p); + options = isl_ctx_peek_options(ctx, &ppcg_options_args); + if (!options || !options->allow_gnu_extensions) + return isl_ast_op_type_print_macro(type, p); + + switch (type) { + case isl_ast_op_max: + return print_max(p); + case isl_ast_op_min: + return print_min(p); + default: + return isl_ast_op_type_print_macro(type, p); + } +} + +/* isl_ast_expr_foreach_ast_op_type or isl_ast_node_foreach_ast_op_type + * callback that prints a macro definition for "type". + */ +static isl_stat print_macro(enum isl_ast_op_type type, void *user) { isl_printer **p = user; - if (type == isl_ast_op_fdiv_q) - return 0; + *p = ppcg_print_macro(type, *p); + if (!*p) + return isl_stat_error; + + return isl_stat_ok; +} + +/* Print the required macros for "expr". + */ +__isl_give isl_printer *ppcg_ast_expr_print_macros( + __isl_keep isl_ast_expr *expr, __isl_take isl_printer *p) +{ + if (isl_ast_expr_foreach_ast_op_type(expr, &print_macro, &p) < 0) + return isl_printer_free(p); + return p; +} - *p = isl_ast_op_type_print_macro(type, *p); +/* isl_id_to_ast_expr_foreach callback that prints the required + * macro definitions for "val". + */ +static isl_stat print_expr_macros(__isl_take isl_id *key, + __isl_take isl_ast_expr *val, void *user) +{ + isl_printer **p = user; - return 0; + *p = ppcg_ast_expr_print_macros(val, *p); + isl_id_free(key); + isl_ast_expr_free(val); + + if (!*p) + return isl_stat_error; + return isl_stat_ok; } -/* Print the required macros for "node", except one for floord. - * The caller is assumed to have printed a macro for floord already - * as it may also appear in the declarations and the statements. +/* Print the required macro definitions for the body of a statement in which + * the access expressions are replaced by the isl_ast_expr objects + * in "ref2expr". + */ +__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p, + __isl_keep isl_id_to_ast_expr *ref2expr) +{ + if (isl_id_to_ast_expr_foreach(ref2expr, &print_expr_macros, &p) < 0) + return isl_printer_free(p); + return p; +} + +/* Print the required macros for "node". */ __isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p, __isl_keep isl_ast_node *node) @@ -54,67 +310,105 @@ return p; } -/* Print "extent" as a sequence of - * - * [1 + maximal_value] +/* Names used for the macros that may appear in a printed isl AST. + */ +const char *ppcg_min = "ppcg_min"; +const char *ppcg_max = "ppcg_max"; +const char *ppcg_fdiv_q = "ppcg_fdiv_q"; + +/* Set the names of the macros that may appear in a printed isl AST. 
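+ * After this call, an isl_ast_op_min expression is printed as, e.g.,
+ * "ppcg_min(a, b)" instead of the default "min(a, b)"
+ * (an illustrative expression, not produced by this code).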
+ */ +__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p) +{ + p = isl_ast_op_type_set_print_name(p, isl_ast_op_min, ppcg_min); + p = isl_ast_op_type_set_print_name(p, isl_ast_op_max, ppcg_max); + p = isl_ast_op_type_set_print_name(p, isl_ast_op_fdiv_q, ppcg_fdiv_q); + + return p; +} + +/* Given a multi affine expression "mpa" without domain, modify it to have + * the schedule space of "build" as domain. * - * one for each dimension. - * "build" is used to simplify the size expressions, if any. + * If the schedule space of "build" is a parameter space, then nothing + * needs to be done. + * Otherwise, "mpa" is first given a 0D domain and then it is combined + * with a mapping from the schedule space of "build" to the same 0D domain. */ -static __isl_give isl_printer *print_extent(__isl_take isl_printer *p, - __isl_keep isl_set *extent, __isl_keep isl_ast_build *build) +__isl_give isl_multi_pw_aff *ppcg_attach_multi_pw_aff( + __isl_take isl_multi_pw_aff *mpa, __isl_keep isl_ast_build *build) { - int i, n; + isl_bool params; + isl_space *space; + isl_multi_aff *ma; + + space = isl_ast_build_get_schedule_space(build); + params = isl_space_is_params(space); + if (params < 0 || params) { + isl_space_free(space); + if (params < 0) + return isl_multi_pw_aff_free(mpa); + return mpa; + } + space = isl_space_from_domain(space); + ma = isl_multi_aff_zero(space); + mpa = isl_multi_pw_aff_from_range(mpa); + mpa = isl_multi_pw_aff_pullback_multi_aff(mpa, ma); - n = isl_set_dim(extent, isl_dim_set); - if (n == 0) - return p; + return mpa; +} - for (i = 0; i < n; ++i) { - isl_set *dom; - isl_local_space *ls; - isl_aff *one; - isl_pw_aff *bound; - isl_ast_expr *expr; - - bound = isl_set_dim_max(isl_set_copy(extent), i); - dom = isl_pw_aff_domain(isl_pw_aff_copy(bound)); - ls = isl_local_space_from_space(isl_set_get_space(dom)); - one = isl_aff_zero_on_domain(ls); - one = isl_aff_add_constant_si(one, 1); - bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one)); - - p = isl_printer_print_str(p, "["); - expr = isl_ast_build_expr_from_pw_aff(build, bound); - p = isl_printer_print_ast_expr(p, expr); - p = isl_printer_print_str(p, "]"); +/* Build an access AST expression from "size" using "build". + * "size" does not have a domain, but "build" may have a proper schedule space. + * First modify "size" to have that schedule space as domain. + */ +__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size, + __isl_keep isl_ast_build *build) +{ + size = ppcg_attach_multi_pw_aff(size, build); + return isl_ast_build_access_from_multi_pw_aff(build, size); +} - isl_ast_expr_free(expr); - } +/* Print a declaration for an array with element type "base_type" and + * size "size" to "p". + */ +__isl_give isl_printer *ppcg_print_declaration_with_size( + __isl_take isl_printer *p, const char *base_type, + __isl_keep isl_ast_expr *size) +{ + if (!base_type || !size) + return isl_printer_free(p); + + p = ppcg_ast_expr_print_macros(size, p); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, base_type); + p = isl_printer_print_str(p, " "); + p = isl_printer_print_ast_expr(p, size); + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); return p; } /* Print a declaration for array "array" to "p", using "build" * to simplify any size expressions. + * + * The size is computed from the extent of the array and is + * subsequently converted to an "access expression" by "build". 
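+ *
+ * For example, an extent { A[i, j] : 0 <= i < 10 and 0 <= j < n }
+ * leads to a declaration such as "float A[10][n];" (with a purely
+ * hypothetical element type; the sizes are computed by
+ * ppcg_size_from_extent in util.c).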
*/ __isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p, struct pet_array *array, __isl_keep isl_ast_build *build) { - const char *name; + isl_multi_pw_aff *size; + isl_ast_expr *expr; if (!array) return isl_printer_free(p); - name = isl_set_get_tuple_name(array->extent); - - p = isl_printer_start_line(p); - p = isl_printer_print_str(p, array->element_type); - p = isl_printer_print_str(p, " "); - p = isl_printer_print_str(p, name); - p = print_extent(p, array->extent, build); - p = isl_printer_print_str(p, ";"); - p = isl_printer_end_line(p); + size = ppcg_size_from_extent(isl_set_copy(array->extent)); + expr = isl_ast_build_access_from_multi_pw_aff(build, size); + p = ppcg_print_declaration_with_size(p, array->element_type, expr); + isl_ast_expr_free(expr); return p; } @@ -164,67 +458,3 @@ { return print_declarations(p, scop, 0); } - -/* Internal data structure for print_guarded_user. - * - * fn is the function that should be called to print the body. - * user is the argument that should be passed to this function. - */ -struct ppcg_print_guarded_data { - __isl_give isl_printer *(*fn)(__isl_take isl_printer *p, void *user); - void *user; -}; - -/* Print the body of the if statement expressing the guard passed - * to "ppcg_print_guarded" by calling data->fn. - */ -static __isl_give isl_printer *print_guarded_user(__isl_take isl_printer *p, - __isl_take isl_ast_print_options *options, - __isl_keep isl_ast_node *node, void *user) -{ - struct ppcg_print_guarded_data *data = user; - - p = data->fn(p, data->user); - - isl_ast_print_options_free(options); - return p; -} - -/* Print a condition for the given "guard" within the given "context" - * on "p", calling "fn" with "user" to print the body of the if statement. - * If the guard is implied by the context, then no if statement is printed - * and the body is printed directly to "p". - * - * Both "guard" and "context" are assumed to be parameter sets. - * - * We slightly abuse the AST generator to print this guard. - * In particular, we create a trivial schedule for an iteration - * domain with a single instance, restricted by the guard. - */ -__isl_give isl_printer *ppcg_print_guarded(__isl_take isl_printer *p, - __isl_take isl_set *guard, __isl_take isl_set *context, - __isl_give isl_printer *(*fn)(__isl_take isl_printer *p, void *user), - void *user) -{ - struct ppcg_print_guarded_data data = { fn, user }; - isl_ctx *ctx; - isl_union_map *schedule; - isl_ast_build *build; - isl_ast_node *tree; - isl_ast_print_options *options; - - ctx = isl_printer_get_ctx(p); - guard = isl_set_from_params(guard); - schedule = isl_union_map_from_map(isl_map_from_domain(guard)); - build = isl_ast_build_from_context(context); - tree = isl_ast_build_node_from_schedule_map(build, schedule); - isl_ast_build_free(build); - - options = isl_ast_print_options_alloc(ctx); - options = isl_ast_print_options_set_print_user(options, - &print_guarded_user, &data); - p = isl_ast_node_print(tree, p, options); - isl_ast_node_free(tree); - - return p; -} Index: polly/trunk/lib/External/ppcg/schedule.h =================================================================== --- polly/trunk/lib/External/ppcg/schedule.h +++ polly/trunk/lib/External/ppcg/schedule.h @@ -2,55 +2,20 @@ #define _SCHEDULE_H #include -#include -#include -#include - -#include - -/* An access to an outer array element or an iterator. - * Accesses to iterators have an access relation that maps to an unnamed space. - * An access may be both read and write. 
- * If the access relation is empty, then the output dimension may - * not be equal to the dimension of the corresponding array. - */ -struct gpu_stmt_access { - /* Access reads elements */ - int read; - /* Access writes elements */ - int write; - /* All writes are definite writes. */ - int exact_write; - /* The number of index expressions specified in the access. */ - int n_index; - - /* May access relation */ - isl_map *access; - /* May access relation with as domain a mapping from iteration domain - * to a reference identifier. - */ - isl_map *tagged_access; - /* The reference id of the corresponding pet_expr. */ - isl_id *ref_id; - - struct gpu_stmt_access *next; -}; - -struct gpu_stmt { - isl_id *id; - struct pet_stmt *stmt; - - /* Linked list of accesses. */ - struct gpu_stmt_access *accesses; -}; - -__isl_give isl_map *project_out(__isl_take isl_space *dim, - int len, int first, int n); -__isl_give isl_map *projection(__isl_take isl_space *dim, - int src_len, int dst_len); +#include +#include +#include + +#include "ppcg_options.h" + __isl_give isl_set *parametrization(__isl_take isl_space *space, int len, int first, __isl_keep isl_id_list *names); -__isl_give isl_set *extend(__isl_take isl_set *set, int dst_len); -__isl_give isl_union_map *align_range(__isl_take isl_union_map *umap); + +__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx, + struct ppcg_options *options, + __isl_give isl_schedule *(*compute)(void *user), void *user); + +__isl_give isl_schedule_node *ppcg_set_schedule_node_type( + __isl_take isl_schedule_node *node, enum isl_ast_loop_type type); #endif Index: polly/trunk/lib/External/ppcg/schedule.c =================================================================== --- polly/trunk/lib/External/ppcg/schedule.c +++ polly/trunk/lib/External/ppcg/schedule.c @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -18,41 +19,6 @@ #include "schedule.h" -/* Construct a map from a len-dimensional domain to - * a (len-n)-dimensional domain that projects out the n coordinates - * starting at first. - * "dim" prescribes the parameters. - */ -__isl_give isl_map *project_out(__isl_take isl_space *dim, - int len, int first, int n) -{ - int i, j; - isl_basic_map *bmap; - - dim = isl_space_add_dims(dim, isl_dim_in, len); - dim = isl_space_add_dims(dim, isl_dim_out, len - n); - bmap = isl_basic_map_universe(dim); - - for (i = 0, j = 0; i < len; ++i) { - if (i >= first && i < first + n) - continue; - bmap = isl_basic_map_equate(bmap, isl_dim_in, i, isl_dim_out, j); - ++j; - } - - return isl_map_from_basic_map(bmap); -} - -/* Construct a projection that maps a src_len dimensional domain - * to its first dst_len coordinates. - * "dim" prescribes the parameters. - */ -__isl_give isl_map *projection(__isl_take isl_space *dim, - int src_len, int dst_len) -{ - return project_out(dim, src_len, dst_len, src_len - dst_len); -} - /* Add parameters with identifiers "ids" to "set". */ static __isl_give isl_set *add_params(__isl_take isl_set *set, @@ -114,79 +80,86 @@ return parametrize(set, first, ids); } -/* Extend "set" with unconstrained coordinates to a total length of "dst_len". +/* Load and return a schedule from a file called "filename". 
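+ * Returns NULL and prints a diagnostic if the file cannot be opened;
+ * ppcg_get_schedule below calls this when options->load_schedule_file
+ * is set.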
*/ -__isl_give isl_set *extend(__isl_take isl_set *set, int dst_len) +static __isl_give isl_schedule *load_schedule(isl_ctx *ctx, + const char *filename) { - int n_set; - isl_space *dim; - isl_map *map; - - dim = isl_set_get_space(set); - n_set = isl_space_dim(dim, isl_dim_set); - dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_set); - map = projection(dim, dst_len, n_set); - map = isl_map_reverse(map); + FILE *file; + isl_schedule *schedule; - return isl_set_apply(set, map); -} - -/* Set max_out to the maximal number of output dimensions over - * all maps. - */ -static isl_stat update_max_out(__isl_take isl_map *map, void *user) -{ - int *max_out = user; - int n_out = isl_map_dim(map, isl_dim_out); - - if (n_out > *max_out) - *max_out = n_out; + file = fopen(filename, "r"); + if (!file) { + fprintf(stderr, "Unable to open '%s' for reading\n", filename); + return NULL; + } + schedule = isl_schedule_read_from_file(ctx, file); + fclose(file); - isl_map_free(map); - return isl_stat_ok; + return schedule; } -struct align_range_data { - int max_out; - isl_union_map *res; -}; - -/* Extend the dimension of the range of the given map to data->max_out and - * then add the result to data->res. +/* Save the schedule "schedule" to a file called "filename". + * The schedule is printed in block style. */ -static isl_stat map_align_range(__isl_take isl_map *map, void *user) +static void save_schedule(__isl_keep isl_schedule *schedule, + const char *filename) { - struct align_range_data *data = user; - int i; - isl_space *dim; - isl_map *proj; - int n_out = isl_map_dim(map, isl_dim_out); + FILE *file; + isl_ctx *ctx; + isl_printer *p; - dim = isl_union_map_get_space(data->res); - proj = isl_map_reverse(projection(dim, data->max_out, n_out)); - for (i = n_out; i < data->max_out; ++i) - proj = isl_map_fix_si(proj, isl_dim_out, i, 0); + if (!schedule) + return; - map = isl_map_apply_range(map, proj); - - data->res = isl_union_map_add_map(data->res, map); + file = fopen(filename, "w"); + if (!file) { + fprintf(stderr, "Unable to open '%s' for writing\n", filename); + return; + } + ctx = isl_schedule_get_ctx(schedule); + p = isl_printer_to_file(ctx, file); + p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK); + p = isl_printer_print_schedule(p, schedule); + isl_printer_free(p); + fclose(file); +} + +/* Obtain a schedule, either by reading it from a file + * or by computing it using "compute". + * Also take care of saving the computed schedule and/or + * dumping the obtained schedule if requested by the user. + */ +__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx, + struct ppcg_options *options, + __isl_give isl_schedule *(*compute)(void *user), void *user) +{ + isl_schedule *schedule; + + if (options->load_schedule_file) { + schedule = load_schedule(ctx, options->load_schedule_file); + } else { + schedule = compute(user); + if (options->save_schedule_file) + save_schedule(schedule, options->save_schedule_file); + } + if (options->debug->dump_schedule) + isl_schedule_dump(schedule); - return isl_stat_ok; + return schedule; } -/* Extend the ranges of the maps in the union map such they all have - * the same dimension. +/* Mark all dimensions in the band node "node" to be of "type".
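+ * For example, ppcg_set_schedule_node_type(node, isl_ast_loop_unroll)
+ * requests that every member of the band be unrolled during AST
+ * generation (an illustrative call; the type values come from isl).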
*/ -__isl_give isl_union_map *align_range(__isl_take isl_union_map *umap) +__isl_give isl_schedule_node *ppcg_set_schedule_node_type( + __isl_take isl_schedule_node *node, enum isl_ast_loop_type type) { - struct align_range_data data; - - data.max_out = 0; - isl_union_map_foreach_map(umap, &update_max_out, &data.max_out); + int i, n; - data.res = isl_union_map_empty(isl_union_map_get_space(umap)); - isl_union_map_foreach_map(umap, &map_align_range, &data); + n = isl_schedule_node_band_n_member(node); + for (i = 0; i < n; ++i) + node = isl_schedule_node_band_member_set_ast_loop_type(node, i, + type); - isl_union_map_free(umap); - return data.res; + return node; } Index: polly/trunk/lib/External/ppcg/tests/iterator.c =================================================================== --- polly/trunk/lib/External/ppcg/tests/iterator.c +++ polly/trunk/lib/External/ppcg/tests/iterator.c @@ -0,0 +1,18 @@ +#include <stdlib.h> + +int main() +{ + int i; + int a[101]; + + i = 0; +#pragma scop + for (i = 0; i < 100; ++i) + a[i] = i; + a[i] = i; +#pragma endscop + if (a[100] != 100) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} Index: polly/trunk/lib/External/ppcg/tests/live_out.c =================================================================== --- polly/trunk/lib/External/ppcg/tests/live_out.c +++ polly/trunk/lib/External/ppcg/tests/live_out.c @@ -0,0 +1,22 @@ +#include <stdlib.h> + +/* Check that a write access is not removed from the live-out + * accesses only because a strict subset of the (potentially) + * accessed elements are killed by a later write. + */ +int main() +{ + int A[10]; + + A[1] = 0; +#pragma scop + int i = 1; + i = i * i; + A[i] = 1; + A[0] = 0; +#pragma endscop + if (A[1] != 1) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} Index: polly/trunk/lib/External/ppcg/tests/local.c =================================================================== --- polly/trunk/lib/External/ppcg/tests/local.c +++ polly/trunk/lib/External/ppcg/tests/local.c @@ -0,0 +1,22 @@ +#include <stdlib.h> + +int main() +{ + int A[100]; + +#pragma scop + { + int B[100]; + B[0] = 0; + for (int i = 1; i < 100; ++i) + B[i] = B[i - 1] + 1; + for (int i = 0; i < 100; ++i) + A[i] = B[i]; + } +#pragma endscop + for (int i = 0; i < 100; ++i) + if (A[i] != i) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} Index: polly/trunk/lib/External/ppcg/tests/struct4.c =================================================================== --- polly/trunk/lib/External/ppcg/tests/struct4.c +++ polly/trunk/lib/External/ppcg/tests/struct4.c @@ -0,0 +1,27 @@ +#include <stdlib.h> + +struct s { + int a; + int b; +}; + +int main() +{ + int a[10]; + + for (int i = 0; i < 10; ++i) + a[i] = 0; +#pragma scop + for (int i = 0; i < 10; ++i) { + struct s b; + b.a = 1; + b.b = i; + a[i] = b.a + b.b; + } +#pragma endscop + for (int i = 0; i < 10; ++i) + if (a[i] != 1 + i) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} Index: polly/trunk/lib/External/ppcg/util.h =================================================================== --- polly/trunk/lib/External/ppcg/util.h +++ polly/trunk/lib/External/ppcg/util.h @@ -3,6 +3,9 @@ #include <string.h> +#include +#include + /* Compare the prefix of "s" to "prefix" up to the length of "prefix".
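+ * For example, prefixcmp("--tile-size=16", "--tile") returns 0
+ * (an illustrative call; the comparison below uses strncmp).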
*/ static inline int prefixcmp(const char *s, const char *prefix) @@ -10,4 +13,10 @@ return strncmp(s, prefix, strlen(prefix)); } +__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space, + int val); +__isl_give isl_multi_val *ppcg_multi_val_from_int_list( + __isl_take isl_space *space, int *list); +__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set); + #endif Index: polly/trunk/lib/External/ppcg/util.c =================================================================== --- polly/trunk/lib/External/ppcg/util.c +++ polly/trunk/lib/External/ppcg/util.c @@ -0,0 +1,105 @@ +/* + * Copyright 2012-2013 Ecole Normale Superieure + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France + */ + +#include +#include +#include +#include + +#include "util.h" + +/* Construct an isl_multi_val living in "space" with all values equal to "val". + */ +__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space, + int val) +{ + int i, n; + isl_ctx *ctx; + isl_val *v; + isl_multi_val *mv; + + if (!space) + return NULL; + + ctx = isl_space_get_ctx(space); + n = isl_space_dim(space, isl_dim_set); + mv = isl_multi_val_zero(space); + v = isl_val_int_from_si(ctx, val); + for (i = 0; i < n; ++i) + mv = isl_multi_val_set_val(mv, i, isl_val_copy(v)); + isl_val_free(v); + + return mv; +} + +/* Construct an isl_multi_val living in "space" with values specified + * by "list". "list" is assumed to have at least as many entries + * as the set dimension of "space". + */ +__isl_give isl_multi_val *ppcg_multi_val_from_int_list( + __isl_take isl_space *space, int *list) +{ + int i, n; + isl_ctx *ctx; + isl_multi_val *mv; + + if (!space) + return NULL; + + ctx = isl_space_get_ctx(space); + n = isl_space_dim(space, isl_dim_set); + mv = isl_multi_val_zero(space); + for (i = 0; i < n; ++i) { + isl_val *v; + + v = isl_val_int_from_si(ctx, list[i]); + mv = isl_multi_val_set_val(mv, i, v); + } + + return mv; +} + +/* Compute the size of a bounding box around the origin and "set", + * where "set" is assumed to contain only non-negative elements. + * In particular, compute the maximal value of "set" in each direction + * and add one. + */ +__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set) +{ + int i, n; + isl_multi_pw_aff *mpa; + + n = isl_set_dim(set, isl_dim_set); + mpa = isl_multi_pw_aff_zero(isl_set_get_space(set)); + for (i = 0; i < n; ++i) { + isl_space *space; + isl_aff *one; + isl_pw_aff *bound; + + if (!isl_set_dim_has_upper_bound(set, isl_dim_set, i)) { + const char *name; + name = isl_set_get_tuple_name(set); + if (!name) + name = ""; + fprintf(stderr, "unable to determine extent of '%s' " + "in dimension %d\n", name, i); + set = isl_set_free(set); + } + bound = isl_set_dim_max(isl_set_copy(set), i); + + space = isl_pw_aff_get_domain_space(bound); + one = isl_aff_zero_on_domain(isl_local_space_from_space(space)); + one = isl_aff_add_constant_si(one, 1); + bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one)); + mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound); + } + isl_set_free(set); + + return mpa; +}
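+
+/* Illustrative example (not part of the original patch): given the extent
+ * { A[i, j] : 0 <= i < 10 and 0 <= j <= n }, ppcg_size_from_extent
+ * returns roughly { A[(10), (1 + n)] }, i.e., the maximal value of the
+ * set in each dimension plus one.
+ */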