Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -238,14 +238,22 @@ /// Create an isl_union_set, which describes the isolate option based on /// IsoalteDomain. /// -/// @param IsolateDomain An isl_set whose last dimension is the only one that -/// should belong to the current band node. +/// @param IsolateDomain An isl_set whose @p OutDimsNum last dimensions should +/// belong to the current band node. +/// @param OutDimsNum A number of dimensions that should belong to +/// the current band node. static __isl_give isl_union_set * -getIsolateOptions(__isl_take isl_set *IsolateDomain) { +getIsolateOptions(__isl_take isl_set *IsolateDomain, unsigned OutDimsNum) { auto Dims = isl_set_dim(IsolateDomain, isl_dim_set); + assert(OutDimsNum <= Dims && + "The isl_set IsolateDomain is used to describe the range of schedule " + "dimensions values, which should be isolated. Consequently, the " + "number of its dimensions should be greater than or equal to the " + "number of the schedule dimensions."); auto *IsolateRelation = isl_map_from_domain(IsolateDomain); - IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0, - isl_dim_in, Dims - 1, 1); + IsolateRelation = + isl_map_move_dims(IsolateRelation, isl_dim_out, 0, isl_dim_in, + Dims - OutDimsNum, OutDimsNum); auto *IsolateOption = isl_map_wrap(IsolateRelation); auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr); return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id)); @@ -264,6 +272,22 @@ return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id)); } +/// Create an isl_union_set, which describes the option of the form +/// [isolate[] -> unroll[x]]. +/// +/// @param Ctx An isl_ctx, which is used to create the isl_union_set. 
+static __isl_give isl_union_set *getUnrollIsolatedSetOptions(isl_ctx *Ctx) { + auto *Space = isl_space_alloc(Ctx, 0, 0, 1); + auto *UnrollIsolatedSetOption = isl_map_universe(Space); + auto *DimInId = isl_id_alloc(Ctx, "isolate", nullptr); + auto *DimOutId = isl_id_alloc(Ctx, "unroll", nullptr); + UnrollIsolatedSetOption = + isl_map_set_tuple_id(UnrollIsolatedSetOption, isl_dim_in, DimInId); + UnrollIsolatedSetOption = + isl_map_set_tuple_id(UnrollIsolatedSetOption, isl_dim_out, DimOutId); + return isl_union_set_from_set(isl_map_wrap(UnrollIsolatedSetOption)); +} + /// Make the last dimension of Set to take values from 0 to VectorWidth - 1. /// /// @param Set A set, which should be modified. @@ -324,7 +348,7 @@ auto *ScheduleRange = isl_map_range(ScheduleRelation); auto *IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth); auto *AtomicOption = getAtomicOptions(isl_set_get_ctx(IsolateDomain)); - auto *IsolateOption = getIsolateOptions(IsolateDomain); + auto *IsolateOption = getIsolateOptions(IsolateDomain, 1); Node = isl_schedule_node_parent(Node); Node = isl_schedule_node_parent(Node); auto *Options = isl_union_set_union(IsolateOption, AtomicOption); @@ -1119,6 +1143,47 @@ return MapOldIndVar; } +/// Isolate a set of partial tile prefixes and unroll the isolated part. +/// +/// The set should ensure that it contains only partial tile prefixes that have +/// exactly Mr x Nr iterations of the two innermost loops produced by +/// the optimization of the matrix multiplication. Mr and Nr are parameters of +/// the micro-kernel. +/// +/// In case of parametric bounds, this helps to auto-vectorize the unrolled +/// innermost loops, using the SLP vectorizer. +/// +/// @param Node The schedule node to be modified. +/// @param MicroKernelParams Parameters of the micro-kernel +/// to be taken into account. +/// @return The modified isl_schedule_node. 
+static __isl_give isl_schedule_node * +isolateAndUnrollMatMulInnerLoops(__isl_take isl_schedule_node *Node, + struct MicroKernelParamsTy MicroKernelParams) { + auto *Child = isl_schedule_node_get_child(Node, 0); + auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_relation(Child); + isl_schedule_node_free(Child); + auto *Prefix = isl_map_range(isl_map_from_union_map(UnMapOldIndVar)); + auto Dims = isl_set_dim(Prefix, isl_dim_set); + Prefix = isl_set_project_out(Prefix, isl_dim_set, Dims - 1, 1); + Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Nr); + Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Mr); + auto *IsolateOption = getIsolateOptions( + isl_set_add_dims(isl_set_copy(Prefix), isl_dim_set, 3), 3); + auto *Ctx = isl_schedule_node_get_ctx(Node); + auto *AtomicOption = getAtomicOptions(Ctx); + auto *Options = + isl_union_set_union(IsolateOption, isl_union_set_copy(AtomicOption)); + Options = isl_union_set_union(Options, getUnrollIsolatedSetOptions(Ctx)); + Node = isl_schedule_node_band_set_ast_build_options(Node, Options); + Node = isl_schedule_node_parent(isl_schedule_node_parent(Node)); + IsolateOption = getIsolateOptions(Prefix, 3); + Options = isl_union_set_union(IsolateOption, AtomicOption); + Node = isl_schedule_node_band_set_ast_build_options(Node, Options); + Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); + return Node; +} + __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern( __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI, MatMulInfoTy &MMI) { @@ -1144,6 +1209,7 @@ Node, MicroKernelParams, MacroKernelParams); if (!MapOldIndVar) return Node; + Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams); return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams, MacroKernelParams, MMI); } @@ -1179,7 +1245,7 @@ MatMulInfoTy MMI; if (PMBasedOpts && User && isMatrMultPattern(Node, OAI->D, MMI)) { DEBUG(dbgs() << "The matrix 
multiplication pattern was detected\n"); - Node = optimizeMatMulPattern(Node, OAI->TTI, MMI); + return optimizeMatMulPattern(Node, OAI->TTI, MMI); } return standardBandOpts(Node, User); Index: test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll =================================================================== --- test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll +++ test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll @@ -42,8 +42,6 @@ ; CHECK-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) ; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, -256 * c1 + 1022); c5 += 1) { ; CHECK-NEXT: // Register tiling - Points -; CHECK-NEXT: // 1st level tiling - Tiles -; CHECK-NEXT: // 1st level tiling - Points ; CHECK-NEXT: { ; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5); ; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5); Index: test/ScheduleOptimizer/pattern-matching-based-opts_3.ll =================================================================== --- test/ScheduleOptimizer/pattern-matching-based-opts_3.ll +++ test/ScheduleOptimizer/pattern-matching-based-opts_3.ll @@ -38,8 +38,6 @@ ; CHECK-NEXT: for (int c1 = 0; c1 <= 263; c1 += 1) ; CHECK-NEXT: for (int c2 = 0; c2 <= 1023; c2 += 1) { ; CHECK-NEXT: // Register tiling - Points -; CHECK-NEXT: // 1st level tiling - Tiles -; CHECK-NEXT: // 1st level tiling - Points ; CHECK-NEXT: { ; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0, c2); ; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 1, c2); @@ -101,8 +99,6 @@ ; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) ; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) { ; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Points -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points ; EXTRACTION-OF-MACRO-KERNEL-NEXT: { ; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5); ; EXTRACTION-OF-MACRO-KERNEL-NEXT: 
Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5); Index: test/ScheduleOptimizer/pattern-matching-based-opts_5.ll =================================================================== --- /dev/null +++ test/ScheduleOptimizer/pattern-matching-based-opts_5.ll @@ -0,0 +1,363 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-target-throughput-vector-fma=1 \ +; RUN: -polly-target-latency-vector-fma=8 \ +; RUN: -analyze -polly-ast -polly-target-1st-cache-level-associativity=8 \ +; RUN: -polly-target-2nd-cache-level-associativity=8 \ +; RUN: -polly-target-1st-cache-level-size=32768 \ +; RUN: -polly-target-vector-register-bitwidth=256 \ +; RUN: -polly-target-2nd-cache-level-size=262144 < %s \ +; RUN: | FileCheck %s +; +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-target-throughput-vector-fma=1 \ +; RUN: -polly-target-latency-vector-fma=8 \ +; RUN: -polly-codegen -polly-target-1st-cache-level-associativity=8 \ +; RUN: -polly-target-2nd-cache-level-associativity=8 \ +; RUN: -polly-target-1st-cache-level-size=32768 \ +; RUN: -polly-target-vector-register-bitwidth=256 \ +; RUN: -polly-target-2nd-cache-level-size=262144 -gvn -licm -slp-vectorizer \ +; RUN: -S < %s | FileCheck %s --check-prefix=AUTO-VECTORIZATION +; +; +; /* We isolate a set of partial tile prefixes, which contains only partial +; tile prefixes that have exactly Mr x Nr iterations of the two innermost +; loops produced by the optimization of the matrix multiplication. Mr and +; Nr are parameters of the micro-kernel (see getMicroKernelParams and +; getMacroKernelParams from lib/Transform/ScheduleOptimizer.cpp for +; details). This test checks that in case of parametric bounds it helps +; auto-vectorize the unrolled innermost loops, using the SLP +; vectorizer. 
*/ +; /* C := A * B + C */ +; for (i = 0; i < _PB_NI; i++) +; for (j = 0; j < _PB_NJ; j++) +; for (k = 0; k < _PB_NK; ++k) +; C[i][j] += A[i][k] * B[k][j]; +; +; CHECK: if (ni >= 1) { +; CHECK-NEXT: // 1st level tiling - Tiles +; CHECK-NEXT: for (int c0 = 0; c0 <= floord(nj - 1, 2048); c0 += 1) +; CHECK-NEXT: for (int c1 = 0; c1 <= floord(nk - 1, 256); c1 += 1) { +; CHECK-NEXT: for (int c3 = 2048 * c0; c3 <= min(nj - 1, 2048 * c0 + 2047); c3 += 1) +; CHECK-NEXT: for (int c4 = 256 * c1; c4 <= min(nk - 1, 256 * c1 + 255); c4 += 1) +; CHECK-NEXT: CopyStmt_0(0, c3, c4); +; CHECK-NEXT: for (int c2 = 0; c2 <= floord(ni - 1, 96); c2 += 1) { +; CHECK-NEXT: if (c0 == 0) +; CHECK-NEXT: for (int c3 = 96 * c2; c3 <= min(ni - 1, 96 * c2 + 95); c3 += 1) +; CHECK-NEXT: for (int c5 = 256 * c1; c5 <= min(nk - 1, 256 * c1 + 255); c5 += 1) +; CHECK-NEXT: CopyStmt_1(c3, 0, c5); +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: // Register tiling - Tiles +; CHECK-NEXT: { +; CHECK-NEXT: if (ni >= 96 * c2 + 4) +; CHECK-NEXT: for (int c3 = 0; c3 <= min(255, -256 * c0 + nj / 8 - 1); c3 += 1) { +; CHECK-NEXT: for (int c4 = 0; c4 <= min(23, -24 * c2 + ni / 4 - 1); c4 += 1) +; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) { +; CHECK-NEXT: // Register tiling - Points +; CHECK-NEXT: { +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 7, 256 
* c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * 
c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: if (96 * c2 + 95 >= ni) +; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) { +; CHECK-NEXT: // Register tiling - Points +; CHECK-NEXT: for (int c6 = 0; c6 < ni % 4; c6 += 1) +; CHECK-NEXT: for (int c7 = 0; c7 <= 7; c7 += 1) +; CHECK-NEXT: Stmt_for_body6(-((ni + 4) % 4) + ni + c6, 2048 * c0 + 8 * c3 + c7, 256 * c1 + c5); +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: if (96 * c2 + 3 >= ni || (2048 * c0 + 2047 >= nj && nj % 8 >= 1)) +; CHECK-NEXT: for (int c3 = 0; c3 <= min(255, -256 * c0 + (nj - 1) / 8); c3 += 1) +; CHECK-NEXT: if (96 * c2 + 3 >= ni || 2048 * c0 + 8 * c3 + 7 >= nj) +; CHECK-NEXT: for (int c4 = 0; c4 <= min(23, -24 * c2 + (ni - 1) / 4); c4 += 1) +; CHECK-NEXT: if ((ni >= 96 * c2 + 4 && 2048 * c0 + 8 * c3 + 7 >= nj) || 1) +; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) { +; CHECK-NEXT: // Register tiling - Points +; CHECK-NEXT: for (int c6 = 0; c6 <= min(3, ni - 96 * c2 - 4 * c4 - 1); c6 += 1) +; CHECK-NEXT: for (int c7 = 0; c7 <= min(7, nj - 2048 * c0 - 8 * c3 - 1); c7 += 1) +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + c6, 2048 * c0 + 8 * c3 + c7, 256 * c1 + c5); +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } +; + +; AUTO-VECTORIZATION:polly.loop_exit166.loopexit: ; preds = %polly.loop_header164 +; AUTO-VECTORIZATION-NEXT: %94 = phi <4 x double> [ %134, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %95 = phi <4 x double> [ %138, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %96 = phi <4 x double> [ %144, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %97 = phi <4 x double> [ %146, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %98 = phi <4 x double> [ %154, %polly.loop_header164 ] 
+; AUTO-VECTORIZATION-NEXT: %99 = phi <4 x double> [ %166, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %100 = phi <4 x double> [ %172, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %101 = phi <4 x double> [ %174, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %102 = bitcast double* %scevgep to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %94, <4 x double>* %102, align 8 +; AUTO-VECTORIZATION-NEXT: %103 = bitcast double* %scevgep257 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %95, <4 x double>* %103, align 8 +; AUTO-VECTORIZATION-NEXT: %104 = bitcast double* %scevgep333 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %96, <4 x double>* %104, align 8 +; AUTO-VECTORIZATION-NEXT: %105 = bitcast double* %scevgep409 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %97, <4 x double>* %105, align 8 +; AUTO-VECTORIZATION-NEXT: %106 = bitcast double* %scevgep485 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %98, <4 x double>* %106, align 8 +; AUTO-VECTORIZATION-NEXT: %107 = bitcast double* %scevgep561 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %99, <4 x double>* %107, align 8 +; AUTO-VECTORIZATION-NEXT: %108 = bitcast double* %scevgep637 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %100, <4 x double>* %108, align 8 +; AUTO-VECTORIZATION-NEXT: %109 = bitcast double* %scevgep713 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %101, <4 x double>* %109, align 8 +; AUTO-VECTORIZATION-NEXT: br label %polly.loop_exit166 + +; AUTO-VECTORIZATION:polly.loop_header164: ; preds = %polly.loop_header164, %polly.loop_preheader165 +; AUTO-VECTORIZATION-NEXT: %polly.indvar168 = phi i64 [ 0, %polly.loop_preheader165 ], [ %polly.indvar_next169, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %119 = phi <4 x double> [ %181, %polly.loop_preheader165 ], [ %134, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %120 = phi <4 x double> 
[ %183, %polly.loop_preheader165 ], [ %138, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %121 = phi <4 x double> [ %185, %polly.loop_preheader165 ], [ %144, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %122 = phi <4 x double> [ %187, %polly.loop_preheader165 ], [ %146, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %123 = phi <4 x double> [ %189, %polly.loop_preheader165 ], [ %154, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %124 = phi <4 x double> [ %191, %polly.loop_preheader165 ], [ %166, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %125 = phi <4 x double> [ %193, %polly.loop_preheader165 ], [ %172, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %126 = phi <4 x double> [ %195, %polly.loop_preheader165 ], [ %174, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_A174 = add nsw i64 %polly.access.mul.Packed_A173, %polly.indvar168 +; AUTO-VECTORIZATION-NEXT: %polly.access.mul.Packed_A175 = mul nsw i64 %polly.access.add.Packed_A174, 4 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_A177 = getelementptr double, double* %polly.access.cast.Packed_A172, i64 %polly.access.mul.Packed_A175 +; AUTO-VECTORIZATION-NEXT: %tmp_p_scalar_ = load double, double* %polly.access.Packed_A177, align 8 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B180 = add nsw i64 %polly.access.mul.Packed_B179, %polly.indvar168 +; AUTO-VECTORIZATION-NEXT: %polly.access.mul.Packed_B181 = mul nsw i64 %polly.access.add.Packed_B180, 8 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B183 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.mul.Packed_B181 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B196 = add nsw i64 %polly.access.mul.Packed_B181, 1 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B197 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B196 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B215 = add nsw i64 
%polly.access.mul.Packed_B181, 2 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B216 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B215 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B234 = add nsw i64 %polly.access.mul.Packed_B181, 3 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B235 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B234 +; AUTO-VECTORIZATION-NEXT: %127 = bitcast double* %polly.access.Packed_B183 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %128 = load <4 x double>, <4 x double>* %127, align 8 +; AUTO-VECTORIZATION-NEXT: %129 = insertelement <4 x double> undef, double %tmp_p_scalar_, i32 0 +; AUTO-VECTORIZATION-NEXT: %130 = insertelement <4 x double> %129, double %tmp_p_scalar_, i32 1 +; AUTO-VECTORIZATION-NEXT: %131 = insertelement <4 x double> %130, double %tmp_p_scalar_, i32 2 +; AUTO-VECTORIZATION-NEXT: %132 = insertelement <4 x double> %131, double %tmp_p_scalar_, i32 3 +; AUTO-VECTORIZATION-NEXT: %133 = fmul <4 x double> %132, %128 +; AUTO-VECTORIZATION-NEXT: %134 = fadd <4 x double> %119, %133 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B253 = add nsw i64 %polly.access.mul.Packed_B181, 4 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B254 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B253 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B272 = add nsw i64 %polly.access.mul.Packed_B181, 5 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B273 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B272 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B291 = add nsw i64 %polly.access.mul.Packed_B181, 6 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B292 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B291 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B310 = add nsw i64 
%polly.access.mul.Packed_B181, 7 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B311 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B310 +; AUTO-VECTORIZATION-NEXT: %135 = bitcast double* %polly.access.Packed_B254 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %136 = load <4 x double>, <4 x double>* %135, align 8 +; AUTO-VECTORIZATION-NEXT: %137 = fmul <4 x double> %132, %136 +; AUTO-VECTORIZATION-NEXT: %138 = fadd <4 x double> %120, %137 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_A322 = add nsw i64 %polly.access.mul.Packed_A175, 1 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_A323 = getelementptr double, double* %polly.access.cast.Packed_A172, i64 %polly.access.add.Packed_A322 +; AUTO-VECTORIZATION-NEXT: %tmp_p_scalar_324 = load double, double* %polly.access.Packed_A323, align 8 +; AUTO-VECTORIZATION-NEXT: %139 = insertelement <4 x double> undef, double %tmp_p_scalar_324, i32 0 +; AUTO-VECTORIZATION-NEXT: %140 = insertelement <4 x double> %139, double %tmp_p_scalar_324, i32 1 +; AUTO-VECTORIZATION-NEXT: %141 = insertelement <4 x double> %140, double %tmp_p_scalar_324, i32 2 +; AUTO-VECTORIZATION-NEXT: %142 = insertelement <4 x double> %141, double %tmp_p_scalar_324, i32 3 +; AUTO-VECTORIZATION-NEXT: %143 = fmul <4 x double> %142, %128 +; AUTO-VECTORIZATION-NEXT: %144 = fadd <4 x double> %121, %143 +; AUTO-VECTORIZATION-NEXT: %145 = fmul <4 x double> %142, %136 +; AUTO-VECTORIZATION-NEXT: %146 = fadd <4 x double> %122, %145 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_A474 = add nsw i64 %polly.access.mul.Packed_A175, 2 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_A475 = getelementptr double, double* %polly.access.cast.Packed_A172, i64 %polly.access.add.Packed_A474 +; AUTO-VECTORIZATION-NEXT: %tmp_p_scalar_476 = load double, double* %polly.access.Packed_A475, align 8 +; AUTO-VECTORIZATION-NEXT: %147 = bitcast double* %polly.access.Packed_B183 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %148 = 
load <4 x double>, <4 x double>* %147, align 8 +; AUTO-VECTORIZATION-NEXT: %149 = insertelement <4 x double> undef, double %tmp_p_scalar_476, i32 0 +; AUTO-VECTORIZATION-NEXT: %150 = insertelement <4 x double> %149, double %tmp_p_scalar_476, i32 1 +; AUTO-VECTORIZATION-NEXT: %151 = insertelement <4 x double> %150, double %tmp_p_scalar_476, i32 2 +; AUTO-VECTORIZATION-NEXT: %152 = insertelement <4 x double> %151, double %tmp_p_scalar_476, i32 3 +; AUTO-VECTORIZATION-NEXT: %153 = fmul <4 x double> %152, %148 +; AUTO-VECTORIZATION-NEXT: %154 = fadd <4 x double> %123, %153 +; AUTO-VECTORIZATION-NEXT: %155 = extractelement <4 x double> %136, i32 2 +; AUTO-VECTORIZATION-NEXT: %156 = extractelement <4 x double> %136, i32 3 +; AUTO-VECTORIZATION-NEXT: %157 = bitcast double* %polly.access.Packed_B254 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %158 = load <4 x double>, <4 x double>* %157, align 8 +; AUTO-VECTORIZATION-NEXT: %159 = extractelement <4 x double> %158, i32 0 +; AUTO-VECTORIZATION-NEXT: %160 = insertelement <4 x double> undef, double %159, i32 0 +; AUTO-VECTORIZATION-NEXT: %161 = extractelement <4 x double> %158, i32 1 +; AUTO-VECTORIZATION-NEXT: %162 = insertelement <4 x double> %160, double %161, i32 1 +; AUTO-VECTORIZATION-NEXT: %163 = insertelement <4 x double> %162, double %155, i32 2 +; AUTO-VECTORIZATION-NEXT: %164 = insertelement <4 x double> %163, double %156, i32 3 +; AUTO-VECTORIZATION-NEXT: %165 = fmul <4 x double> %152, %164 +; AUTO-VECTORIZATION-NEXT: %166 = fadd <4 x double> %124, %165 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_A626 = add nsw i64 %polly.access.mul.Packed_A175, 3 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_A627 = getelementptr double, double* %polly.access.cast.Packed_A172, i64 %polly.access.add.Packed_A626 +; AUTO-VECTORIZATION-NEXT: %tmp_p_scalar_628 = load double, double* %polly.access.Packed_A627, align 8 +; AUTO-VECTORIZATION-NEXT: %167 = insertelement <4 x double> undef, double %tmp_p_scalar_628, i32 0 +; 
AUTO-VECTORIZATION-NEXT: %168 = insertelement <4 x double> %167, double %tmp_p_scalar_628, i32 1 +; AUTO-VECTORIZATION-NEXT: %169 = insertelement <4 x double> %168, double %tmp_p_scalar_628, i32 2 +; AUTO-VECTORIZATION-NEXT: %170 = insertelement <4 x double> %169, double %tmp_p_scalar_628, i32 3 +; AUTO-VECTORIZATION-NEXT: %171 = fmul <4 x double> %170, %148 +; AUTO-VECTORIZATION-NEXT: %172 = fadd <4 x double> %125, %171 +; AUTO-VECTORIZATION-NEXT: %173 = fmul <4 x double> %170, %158 +; AUTO-VECTORIZATION-NEXT: %174 = fadd <4 x double> %126, %173 +; AUTO-VECTORIZATION-NEXT: %polly.indvar_next169 = add nsw i64 %polly.indvar168, 1 +; AUTO-VECTORIZATION-NEXT: %polly.loop_cond171 = icmp sle i64 %polly.indvar168, %polly.adjust_ub170 +; AUTO-VECTORIZATION-NEXT: br i1 %polly.loop_cond171, label %polly.loop_header164, label %polly.loop_exit166.loopexit + +; AUTO-VECTORIZATION:polly.loop_preheader165: ; preds = %polly.loop_header155 +; AUTO-VECTORIZATION-NEXT: %175 = mul nsw i64 4, %polly.indvar159 +; AUTO-VECTORIZATION-NEXT: %176 = add nsw i64 %.pre-phi, %175 +; AUTO-VECTORIZATION-NEXT: %polly.access.mul.Packed_A173 = mul nsw i64 %polly.indvar159, 256 +; AUTO-VECTORIZATION-NEXT: %scevgep = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %111 +; AUTO-VECTORIZATION-NEXT: %scevgep200 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %112 +; AUTO-VECTORIZATION-NEXT: %scevgep219 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %113 +; AUTO-VECTORIZATION-NEXT: %scevgep238 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %114 +; AUTO-VECTORIZATION-NEXT: %scevgep257 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %115 +; AUTO-VECTORIZATION-NEXT: %scevgep276 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %116 +; AUTO-VECTORIZATION-NEXT: %scevgep295 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %117 +; AUTO-VECTORIZATION-NEXT: %scevgep314 = 
getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %118 +; AUTO-VECTORIZATION-NEXT: %177 = add nsw i64 %176, 1 +; AUTO-VECTORIZATION-NEXT: %scevgep333 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %111 +; AUTO-VECTORIZATION-NEXT: %scevgep352 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %112 +; AUTO-VECTORIZATION-NEXT: %scevgep371 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %113 +; AUTO-VECTORIZATION-NEXT: %scevgep390 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %114 +; AUTO-VECTORIZATION-NEXT: %scevgep409 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %115 +; AUTO-VECTORIZATION-NEXT: %scevgep428 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %116 +; AUTO-VECTORIZATION-NEXT: %scevgep447 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %117 +; AUTO-VECTORIZATION-NEXT: %scevgep466 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %118 +; AUTO-VECTORIZATION-NEXT: %178 = add nsw i64 %176, 2 +; AUTO-VECTORIZATION-NEXT: %scevgep485 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %111 +; AUTO-VECTORIZATION-NEXT: %scevgep504 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %112 +; AUTO-VECTORIZATION-NEXT: %scevgep523 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %113 +; AUTO-VECTORIZATION-NEXT: %scevgep542 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %114 +; AUTO-VECTORIZATION-NEXT: %scevgep561 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %115 +; AUTO-VECTORIZATION-NEXT: %scevgep580 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %116 +; AUTO-VECTORIZATION-NEXT: %scevgep599 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %117 +; AUTO-VECTORIZATION-NEXT: %scevgep618 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %118 +; 
AUTO-VECTORIZATION-NEXT: %179 = add nsw i64 %176, 3 +; AUTO-VECTORIZATION-NEXT: %scevgep637 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %111 +; AUTO-VECTORIZATION-NEXT: %scevgep656 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %112 +; AUTO-VECTORIZATION-NEXT: %scevgep675 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %113 +; AUTO-VECTORIZATION-NEXT: %scevgep694 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %114 +; AUTO-VECTORIZATION-NEXT: %scevgep713 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %115 +; AUTO-VECTORIZATION-NEXT: %scevgep732 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %116 +; AUTO-VECTORIZATION-NEXT: %scevgep751 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %117 +; AUTO-VECTORIZATION-NEXT: %scevgep770 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %118 +; AUTO-VECTORIZATION-NEXT: %180 = bitcast double* %scevgep to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %181 = load <4 x double>, <4 x double>* %180, align 8 +; AUTO-VECTORIZATION-NEXT: %182 = bitcast double* %scevgep257 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %183 = load <4 x double>, <4 x double>* %182, align 8 +; AUTO-VECTORIZATION-NEXT: %184 = bitcast double* %scevgep333 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %185 = load <4 x double>, <4 x double>* %184, align 8 +; AUTO-VECTORIZATION-NEXT: %186 = bitcast double* %scevgep409 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %187 = load <4 x double>, <4 x double>* %186, align 8 +; AUTO-VECTORIZATION-NEXT: %188 = bitcast double* %scevgep485 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %189 = load <4 x double>, <4 x double>* %188, align 8 +; AUTO-VECTORIZATION-NEXT: %190 = bitcast double* %scevgep561 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %191 = load <4 x double>, <4 x double>* %190, align 8 +; AUTO-VECTORIZATION-NEXT: %192 = bitcast double* %scevgep637 to <4 x 
double>* +; AUTO-VECTORIZATION-NEXT: %193 = load <4 x double>, <4 x double>* %192, align 8 +; AUTO-VECTORIZATION-NEXT: %194 = bitcast double* %scevgep713 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %195 = load <4 x double>, <4 x double>* %194, align 8 +; AUTO-VECTORIZATION-NEXT: br label %polly.loop_header164 +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +define internal void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) #0 { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %cmp39 = icmp sgt i32 %ni, 0 + br i1 %cmp39, label %for.cond1.preheader.lr.ph, label %for.end22 + +for.cond1.preheader.lr.ph: ; preds = %entry.split + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc20, %for.cond1.preheader.lr.ph + %indvars.iv45 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next46, %for.inc20 ] + %cmp237 = icmp sgt i32 %nj, 0 + br i1 %cmp237, label %for.cond4.preheader.lr.ph, label %for.inc20 + +for.cond4.preheader.lr.ph: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc17, %for.cond4.preheader.lr.ph + %indvars.iv41 = phi i64 [ 0, %for.cond4.preheader.lr.ph ], [ %indvars.iv.next42, %for.inc17 ] + %cmp535 = icmp sgt i32 %nk, 0 + br i1 %cmp535, label %for.body6.lr.ph, label %for.inc17 + +for.body6.lr.ph: ; preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6, %for.body6.lr.ph + %indvars.iv = phi i64 [ 0, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ] + %arrayidx8 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i64 %indvars.iv45, i64 %indvars.iv + %tmp = load double, double* %arrayidx8, align 8 + %arrayidx12 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv41 + %tmp1 = load double, double* %arrayidx12, align 8 + %mul = fmul 
double %tmp, %tmp1 + %arrayidx16 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv45, i64 %indvars.iv41 + %tmp2 = load double, double* %arrayidx16, align 8 + %add = fadd double %tmp2, %mul + store double %add, double* %arrayidx16, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %nk to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body6, label %for.cond4.for.inc17_crit_edge + +for.cond4.for.inc17_crit_edge: ; preds = %for.body6 + br label %for.inc17 + +for.inc17: ; preds = %for.cond4.for.inc17_crit_edge, %for.cond4.preheader + %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1 + %wide.trip.count43 = zext i32 %nj to i64 + %exitcond44 = icmp ne i64 %indvars.iv.next42, %wide.trip.count43 + br i1 %exitcond44, label %for.cond4.preheader, label %for.cond1.for.inc20_crit_edge + +for.cond1.for.inc20_crit_edge: ; preds = %for.inc17 + br label %for.inc20 + +for.inc20: ; preds = %for.cond1.for.inc20_crit_edge, %for.cond1.preheader + %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1 + %wide.trip.count47 = zext i32 %ni to i64 + %exitcond48 = icmp ne i64 %indvars.iv.next46, %wide.trip.count47 + br i1 %exitcond48, label %for.cond1.preheader, label %for.cond.for.end22_crit_edge + +for.cond.for.end22_crit_edge: ; preds = %for.inc20 + br label %for.end22 + +for.end22: ; preds = %for.cond.for.end22_crit_edge, %entry.split + ret void +} + +attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+aes,+avx,+cmov,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }