Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -238,14 +238,22 @@ /// Create an isl_union_set, which describes the isolate option based on /// IsoalteDomain. /// -/// @param IsolateDomain An isl_set whose last dimension is the only one that -/// should belong to the current band node. +/// @param IsolateDomain An isl_set whose @p OutDimsNum last dimensions should +/// belong to the current band node. +/// @param OutDimsNum A number of dimensions that should belong to +/// the current band node. static __isl_give isl_union_set * -getIsolateOptions(__isl_take isl_set *IsolateDomain) { +getIsolateOptions(__isl_take isl_set *IsolateDomain, unsigned OutDimsNum) { auto Dims = isl_set_dim(IsolateDomain, isl_dim_set); + assert(OutDimsNum <= Dims && + "The isl_set IsolateDomain is used to describe the range of schedule " + "dimensions values, which should be isolated. Consequently, the " + "number of its dimensions should be greater than or equal to the " + "number of the schedule dimensions."); auto *IsolateRelation = isl_map_from_domain(IsolateDomain); - IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0, - isl_dim_in, Dims - 1, 1); + IsolateRelation = + isl_map_move_dims(IsolateRelation, isl_dim_out, 0, isl_dim_in, + Dims - OutDimsNum, OutDimsNum); auto *IsolateOption = isl_map_wrap(IsolateRelation); auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr); return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id)); @@ -264,6 +272,22 @@ return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id)); } +/// Create an isl_union_set, which describes the option of the form +/// [isolate[] -> unroll[x]]. +/// +/// @param Ctx An isl_ctx, which is used to create the isl_union_set. 
+static __isl_give isl_union_set *getUnrollIsolatedSetOptions(isl_ctx *Ctx) { + auto *Space = isl_space_alloc(Ctx, 0, 0, 1); + auto *UnrollIsolatedSetOption = isl_map_universe(Space); + auto *DimInId = isl_id_alloc(Ctx, "isolate", nullptr); + auto *DimOutId = isl_id_alloc(Ctx, "unroll", nullptr); + UnrollIsolatedSetOption = + isl_map_set_tuple_id(UnrollIsolatedSetOption, isl_dim_in, DimInId); + UnrollIsolatedSetOption = + isl_map_set_tuple_id(UnrollIsolatedSetOption, isl_dim_out, DimOutId); + return isl_union_set_from_set(isl_map_wrap(UnrollIsolatedSetOption)); +} + /// Make the last dimension of Set to take values from 0 to VectorWidth - 1. /// /// @param Set A set, which should be modified. @@ -324,7 +348,7 @@ auto *ScheduleRange = isl_map_range(ScheduleRelation); auto *IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth); auto *AtomicOption = getAtomicOptions(isl_set_get_ctx(IsolateDomain)); - auto *IsolateOption = getIsolateOptions(IsolateDomain); + auto *IsolateOption = getIsolateOptions(IsolateDomain, 1); Node = isl_schedule_node_parent(Node); Node = isl_schedule_node_parent(Node); auto *Options = isl_union_set_union(IsolateOption, AtomicOption); @@ -1119,6 +1143,47 @@ return MapOldIndVar; } +/// Isolate a set of partial tile prefixes and unroll the isolated part. +/// +/// The set should ensure that it contains only partial tile prefixes that have +/// exactly Mr x Nr iterations of the two innermost loops produced by +/// the optimization of the matrix multiplication. Mr and Nr are parameters of +/// the micro-kernel. +/// +/// In case of parametric bounds, this helps to auto-vectorize the unrolled +/// innermost loops, using the SLP vectorizer. +/// +/// @param Node The schedule node to be modified. +/// @param MicroKernelParams Parameters of the micro-kernel +/// to be taken into account. +/// @return The modified isl_schedule_node. 
+static __isl_give isl_schedule_node * +isolateAndUnrollMatMulInnerLoops(__isl_take isl_schedule_node *Node, + struct MicroKernelParamsTy MicroKernelParams) { + auto *Child = isl_schedule_node_get_child(Node, 0); + auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_relation(Child); + isl_schedule_node_free(Child); + auto *Prefix = isl_map_range(isl_map_from_union_map(UnMapOldIndVar)); + auto Dims = isl_set_dim(Prefix, isl_dim_set); + Prefix = isl_set_project_out(Prefix, isl_dim_set, Dims - 1, 1); + Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Nr); + Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Mr); + auto *IsolateOption = getIsolateOptions( + isl_set_add_dims(isl_set_copy(Prefix), isl_dim_set, 3), 3); + auto *Ctx = isl_schedule_node_get_ctx(Node); + auto *AtomicOption = getAtomicOptions(Ctx); + auto *Options = + isl_union_set_union(IsolateOption, isl_union_set_copy(AtomicOption)); + Options = isl_union_set_union(Options, getUnrollIsolatedSetOptions(Ctx)); + Node = isl_schedule_node_band_set_ast_build_options(Node, Options); + Node = isl_schedule_node_parent(isl_schedule_node_parent(Node)); + IsolateOption = getIsolateOptions(Prefix, 3); + Options = isl_union_set_union(IsolateOption, AtomicOption); + Node = isl_schedule_node_band_set_ast_build_options(Node, Options); + Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0); + return Node; +} + __isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern( __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI, MatMulInfoTy &MMI) { @@ -1144,6 +1209,7 @@ Node, MicroKernelParams, MacroKernelParams); if (!MapOldIndVar) return Node; + Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams); return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams, MacroKernelParams, MMI); } @@ -1179,7 +1245,7 @@ MatMulInfoTy MMI; if (PMBasedOpts && User && isMatrMultPattern(Node, OAI->D, MMI)) { DEBUG(dbgs() << "The matrix 
multiplication pattern was detected\n"); - Node = optimizeMatMulPattern(Node, OAI->TTI, MMI); + return optimizeMatMulPattern(Node, OAI->TTI, MMI); } return standardBandOpts(Node, User); Index: test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll =================================================================== --- test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll +++ test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll @@ -42,8 +42,6 @@ ; CHECK-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) ; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, -256 * c1 + 1022); c5 += 1) { ; CHECK-NEXT: // Register tiling - Points -; CHECK-NEXT: // 1st level tiling - Tiles -; CHECK-NEXT: // 1st level tiling - Points ; CHECK-NEXT: { ; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5); ; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5); Index: test/ScheduleOptimizer/pattern-matching-based-opts_3.ll =================================================================== --- test/ScheduleOptimizer/pattern-matching-based-opts_3.ll +++ test/ScheduleOptimizer/pattern-matching-based-opts_3.ll @@ -38,8 +38,6 @@ ; CHECK-NEXT: for (int c1 = 0; c1 <= 263; c1 += 1) ; CHECK-NEXT: for (int c2 = 0; c2 <= 1023; c2 += 1) { ; CHECK-NEXT: // Register tiling - Points -; CHECK-NEXT: // 1st level tiling - Tiles -; CHECK-NEXT: // 1st level tiling - Points ; CHECK-NEXT: { ; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0, c2); ; CHECK-NEXT: Stmt_Copy_0(4 * c1, 8 * c0 + 1, c2); @@ -101,8 +99,6 @@ ; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) ; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) { ; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Points -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points ; EXTRACTION-OF-MACRO-KERNEL-NEXT: { ; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5); ; EXTRACTION-OF-MACRO-KERNEL-NEXT: 
Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5); Index: test/ScheduleOptimizer/pattern-matching-based-opts_5.ll =================================================================== --- /dev/null +++ test/ScheduleOptimizer/pattern-matching-based-opts_5.ll @@ -0,0 +1,363 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-target-throughput-vector-fma=1 \ +; RUN: -polly-target-latency-vector-fma=8 \ +; RUN: -analyze -polly-ast -polly-target-1st-cache-level-associativity=8 \ +; RUN: -polly-target-2nd-cache-level-associativity=8 \ +; RUN: -polly-target-1st-cache-level-size=32768 \ +; RUN: -polly-target-vector-register-bitwidth=256 \ +; RUN: -polly-target-2nd-cache-level-size=262144 < %s \ +; RUN: | FileCheck %s +; +; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \ +; RUN: -polly-target-throughput-vector-fma=1 \ +; RUN: -polly-target-latency-vector-fma=8 \ +; RUN: -polly-codegen -polly-target-1st-cache-level-associativity=8 \ +; RUN: -polly-target-2nd-cache-level-associativity=8 \ +; RUN: -polly-target-1st-cache-level-size=32768 \ +; RUN: -polly-target-vector-register-bitwidth=256 \ +; RUN: -polly-target-2nd-cache-level-size=262144 -gvn -licm -slp-vectorizer \ +; RUN: -S < %s | FileCheck %s --check-prefix=AUTO-VECTORIZATION +; +; +; /* We isolate a set of partial tile prefixes, which contains only partial +; tile prefixes that have exactly Mr x Nr iterations of the two innermost +; loops produced by the optimization of the matrix multiplication. Mr and +; Nr are parameters of the micro-kernel (see getMicroKernelParams and +; getMacroKernelParams from lib/Transform/ScheduleOptimizer.cpp for +; details). This test checks that in case of parametric bounds it helps +; auto-vectorize the unrolled innermost loops, using the SLP +; vectorizer. 
*/ +; /* C := A * B + C */ +; for (i = 0; i < _PB_NI; i++) +; for (j = 0; j < _PB_NJ; j++) +; for (k = 0; k < _PB_NK; ++k) +; C[i][j] += A[i][k] * B[k][j]; +; +; CHECK: if (ni >= 1) { +; CHECK-NEXT: // 1st level tiling - Tiles +; CHECK-NEXT: for (int c0 = 0; c0 <= floord(nj - 1, 2048); c0 += 1) +; CHECK-NEXT: for (int c1 = 0; c1 <= floord(nk - 1, 256); c1 += 1) { +; CHECK-NEXT: for (int c3 = 2048 * c0; c3 <= min(nj - 1, 2048 * c0 + 2047); c3 += 1) +; CHECK-NEXT: for (int c4 = 256 * c1; c4 <= min(nk - 1, 256 * c1 + 255); c4 += 1) +; CHECK-NEXT: CopyStmt_0(0, c3, c4); +; CHECK-NEXT: for (int c2 = 0; c2 <= floord(ni - 1, 96); c2 += 1) { +; CHECK-NEXT: if (c0 == 0) +; CHECK-NEXT: for (int c3 = 96 * c2; c3 <= min(ni - 1, 96 * c2 + 95); c3 += 1) +; CHECK-NEXT: for (int c5 = 256 * c1; c5 <= min(nk - 1, 256 * c1 + 255); c5 += 1) +; CHECK-NEXT: CopyStmt_1(c3, 0, c5); +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: // Register tiling - Tiles +; CHECK-NEXT: { +; CHECK-NEXT: if (ni >= 96 * c2 + 4) +; CHECK-NEXT: for (int c3 = 0; c3 <= min(255, -256 * c0 + nj / 8 - 1); c3 += 1) { +; CHECK-NEXT: for (int c4 = 0; c4 <= min(23, -24 * c2 + ni / 4 - 1); c4 += 1) +; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) { +; CHECK-NEXT: // Register tiling - Points +; CHECK-NEXT: { +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 7, 256 
* c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * 
c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: if (96 * c2 + 95 >= ni) +; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) { +; CHECK-NEXT: // Register tiling - Points +; CHECK-NEXT: for (int c6 = 0; c6 < ni % 4; c6 += 1) +; CHECK-NEXT: for (int c7 = 0; c7 <= 7; c7 += 1) +; CHECK-NEXT: Stmt_for_body6(-((ni + 4) % 4) + ni + c6, 2048 * c0 + 8 * c3 + c7, 256 * c1 + c5); +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: if (96 * c2 + 3 >= ni || (2048 * c0 + 2047 >= nj && nj % 8 >= 1)) +; CHECK-NEXT: for (int c3 = 0; c3 <= min(255, -256 * c0 + (nj - 1) / 8); c3 += 1) +; CHECK-NEXT: if (96 * c2 + 3 >= ni || 2048 * c0 + 8 * c3 + 7 >= nj) +; CHECK-NEXT: for (int c4 = 0; c4 <= min(23, -24 * c2 + (ni - 1) / 4); c4 += 1) +; CHECK-NEXT: if ((ni >= 96 * c2 + 4 && 2048 * c0 + 8 * c3 + 7 >= nj) || 1) +; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) { +; CHECK-NEXT: // Register tiling - Points +; CHECK-NEXT: for (int c6 = 0; c6 <= min(3, ni - 96 * c2 - 4 * c4 - 1); c6 += 1) +; CHECK-NEXT: for (int c7 = 0; c7 <= min(7, nj - 2048 * c0 - 8 * c3 - 1); c7 += 1) +; CHECK-NEXT: Stmt_for_body6(96 * c2 + 4 * c4 + c6, 2048 * c0 + 8 * c3 + c7, 256 * c1 + c5); +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } +; CHECK-NEXT: } +; + +; AUTO-VECTORIZATION:polly.loop_exit166.loopexit: ; preds = %polly.loop_header164 +; AUTO-VECTORIZATION-NEXT: %94 = phi <4 x double> [ %134, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %95 = phi <4 x double> [ %138, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %96 = phi <4 x double> [ %144, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %97 = phi <4 x double> [ %146, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %98 = phi <4 x double> [ %154, %polly.loop_header164 ] 
+; AUTO-VECTORIZATION-NEXT: %99 = phi <4 x double> [ %166, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %100 = phi <4 x double> [ %172, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %101 = phi <4 x double> [ %174, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %102 = bitcast double* %scevgep to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %94, <4 x double>* %102, align 8 +; AUTO-VECTORIZATION-NEXT: %103 = bitcast double* %scevgep257 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %95, <4 x double>* %103, align 8 +; AUTO-VECTORIZATION-NEXT: %104 = bitcast double* %scevgep333 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %96, <4 x double>* %104, align 8 +; AUTO-VECTORIZATION-NEXT: %105 = bitcast double* %scevgep409 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %97, <4 x double>* %105, align 8 +; AUTO-VECTORIZATION-NEXT: %106 = bitcast double* %scevgep485 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %98, <4 x double>* %106, align 8 +; AUTO-VECTORIZATION-NEXT: %107 = bitcast double* %scevgep561 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %99, <4 x double>* %107, align 8 +; AUTO-VECTORIZATION-NEXT: %108 = bitcast double* %scevgep637 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %100, <4 x double>* %108, align 8 +; AUTO-VECTORIZATION-NEXT: %109 = bitcast double* %scevgep713 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: store <4 x double> %101, <4 x double>* %109, align 8 +; AUTO-VECTORIZATION-NEXT: br label %polly.loop_exit166 + +; AUTO-VECTORIZATION:polly.loop_header164: ; preds = %polly.loop_header164, %polly.loop_preheader165 +; AUTO-VECTORIZATION-NEXT: %polly.indvar168 = phi i64 [ 0, %polly.loop_preheader165 ], [ %polly.indvar_next169, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %119 = phi <4 x double> [ %181, %polly.loop_preheader165 ], [ %134, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %120 = phi <4 x double> 
[ %183, %polly.loop_preheader165 ], [ %138, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %121 = phi <4 x double> [ %185, %polly.loop_preheader165 ], [ %144, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %122 = phi <4 x double> [ %187, %polly.loop_preheader165 ], [ %146, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %123 = phi <4 x double> [ %189, %polly.loop_preheader165 ], [ %154, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %124 = phi <4 x double> [ %191, %polly.loop_preheader165 ], [ %166, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %125 = phi <4 x double> [ %193, %polly.loop_preheader165 ], [ %172, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %126 = phi <4 x double> [ %195, %polly.loop_preheader165 ], [ %174, %polly.loop_header164 ] +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_A174 = add nsw i64 %polly.access.mul.Packed_A173, %polly.indvar168 +; AUTO-VECTORIZATION-NEXT: %polly.access.mul.Packed_A175 = mul nsw i64 %polly.access.add.Packed_A174, 4 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_A177 = getelementptr double, double* %polly.access.cast.Packed_A172, i64 %polly.access.mul.Packed_A175 +; AUTO-VECTORIZATION-NEXT: %tmp_p_scalar_ = load double, double* %polly.access.Packed_A177, align 8 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B180 = add nsw i64 %polly.access.mul.Packed_B179, %polly.indvar168 +; AUTO-VECTORIZATION-NEXT: %polly.access.mul.Packed_B181 = mul nsw i64 %polly.access.add.Packed_B180, 8 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B183 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.mul.Packed_B181 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B196 = add nsw i64 %polly.access.mul.Packed_B181, 1 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B197 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B196 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B215 = add nsw i64 
%polly.access.mul.Packed_B181, 2 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B216 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B215 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B234 = add nsw i64 %polly.access.mul.Packed_B181, 3 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B235 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B234 +; AUTO-VECTORIZATION-NEXT: %127 = bitcast double* %polly.access.Packed_B183 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %128 = load <4 x double>, <4 x double>* %127, align 8 +; AUTO-VECTORIZATION-NEXT: %129 = insertelement <4 x double> undef, double %tmp_p_scalar_, i32 0 +; AUTO-VECTORIZATION-NEXT: %130 = insertelement <4 x double> %129, double %tmp_p_scalar_, i32 1 +; AUTO-VECTORIZATION-NEXT: %131 = insertelement <4 x double> %130, double %tmp_p_scalar_, i32 2 +; AUTO-VECTORIZATION-NEXT: %132 = insertelement <4 x double> %131, double %tmp_p_scalar_, i32 3 +; AUTO-VECTORIZATION-NEXT: %133 = fmul <4 x double> %132, %128 +; AUTO-VECTORIZATION-NEXT: %134 = fadd <4 x double> %119, %133 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B253 = add nsw i64 %polly.access.mul.Packed_B181, 4 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B254 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B253 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B272 = add nsw i64 %polly.access.mul.Packed_B181, 5 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B273 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B272 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B291 = add nsw i64 %polly.access.mul.Packed_B181, 6 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B292 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B291 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_B310 = add nsw i64 
%polly.access.mul.Packed_B181, 7 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_B311 = getelementptr double, double* %polly.access.cast.Packed_B178, i64 %polly.access.add.Packed_B310 +; AUTO-VECTORIZATION-NEXT: %135 = bitcast double* %polly.access.Packed_B254 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %136 = load <4 x double>, <4 x double>* %135, align 8 +; AUTO-VECTORIZATION-NEXT: %137 = fmul <4 x double> %132, %136 +; AUTO-VECTORIZATION-NEXT: %138 = fadd <4 x double> %120, %137 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_A322 = add nsw i64 %polly.access.mul.Packed_A175, 1 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_A323 = getelementptr double, double* %polly.access.cast.Packed_A172, i64 %polly.access.add.Packed_A322 +; AUTO-VECTORIZATION-NEXT: %tmp_p_scalar_324 = load double, double* %polly.access.Packed_A323, align 8 +; AUTO-VECTORIZATION-NEXT: %139 = insertelement <4 x double> undef, double %tmp_p_scalar_324, i32 0 +; AUTO-VECTORIZATION-NEXT: %140 = insertelement <4 x double> %139, double %tmp_p_scalar_324, i32 1 +; AUTO-VECTORIZATION-NEXT: %141 = insertelement <4 x double> %140, double %tmp_p_scalar_324, i32 2 +; AUTO-VECTORIZATION-NEXT: %142 = insertelement <4 x double> %141, double %tmp_p_scalar_324, i32 3 +; AUTO-VECTORIZATION-NEXT: %143 = fmul <4 x double> %142, %128 +; AUTO-VECTORIZATION-NEXT: %144 = fadd <4 x double> %121, %143 +; AUTO-VECTORIZATION-NEXT: %145 = fmul <4 x double> %142, %136 +; AUTO-VECTORIZATION-NEXT: %146 = fadd <4 x double> %122, %145 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_A474 = add nsw i64 %polly.access.mul.Packed_A175, 2 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_A475 = getelementptr double, double* %polly.access.cast.Packed_A172, i64 %polly.access.add.Packed_A474 +; AUTO-VECTORIZATION-NEXT: %tmp_p_scalar_476 = load double, double* %polly.access.Packed_A475, align 8 +; AUTO-VECTORIZATION-NEXT: %147 = bitcast double* %polly.access.Packed_B183 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %148 = 
load <4 x double>, <4 x double>* %147, align 8 +; AUTO-VECTORIZATION-NEXT: %149 = insertelement <4 x double> undef, double %tmp_p_scalar_476, i32 0 +; AUTO-VECTORIZATION-NEXT: %150 = insertelement <4 x double> %149, double %tmp_p_scalar_476, i32 1 +; AUTO-VECTORIZATION-NEXT: %151 = insertelement <4 x double> %150, double %tmp_p_scalar_476, i32 2 +; AUTO-VECTORIZATION-NEXT: %152 = insertelement <4 x double> %151, double %tmp_p_scalar_476, i32 3 +; AUTO-VECTORIZATION-NEXT: %153 = fmul <4 x double> %152, %148 +; AUTO-VECTORIZATION-NEXT: %154 = fadd <4 x double> %123, %153 +; AUTO-VECTORIZATION-NEXT: %155 = extractelement <4 x double> %136, i32 2 +; AUTO-VECTORIZATION-NEXT: %156 = extractelement <4 x double> %136, i32 3 +; AUTO-VECTORIZATION-NEXT: %157 = bitcast double* %polly.access.Packed_B254 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %158 = load <4 x double>, <4 x double>* %157, align 8 +; AUTO-VECTORIZATION-NEXT: %159 = extractelement <4 x double> %158, i32 0 +; AUTO-VECTORIZATION-NEXT: %160 = insertelement <4 x double> undef, double %159, i32 0 +; AUTO-VECTORIZATION-NEXT: %161 = extractelement <4 x double> %158, i32 1 +; AUTO-VECTORIZATION-NEXT: %162 = insertelement <4 x double> %160, double %161, i32 1 +; AUTO-VECTORIZATION-NEXT: %163 = insertelement <4 x double> %162, double %155, i32 2 +; AUTO-VECTORIZATION-NEXT: %164 = insertelement <4 x double> %163, double %156, i32 3 +; AUTO-VECTORIZATION-NEXT: %165 = fmul <4 x double> %152, %164 +; AUTO-VECTORIZATION-NEXT: %166 = fadd <4 x double> %124, %165 +; AUTO-VECTORIZATION-NEXT: %polly.access.add.Packed_A626 = add nsw i64 %polly.access.mul.Packed_A175, 3 +; AUTO-VECTORIZATION-NEXT: %polly.access.Packed_A627 = getelementptr double, double* %polly.access.cast.Packed_A172, i64 %polly.access.add.Packed_A626 +; AUTO-VECTORIZATION-NEXT: %tmp_p_scalar_628 = load double, double* %polly.access.Packed_A627, align 8 +; AUTO-VECTORIZATION-NEXT: %167 = insertelement <4 x double> undef, double %tmp_p_scalar_628, i32 0 +; 
AUTO-VECTORIZATION-NEXT: %168 = insertelement <4 x double> %167, double %tmp_p_scalar_628, i32 1 +; AUTO-VECTORIZATION-NEXT: %169 = insertelement <4 x double> %168, double %tmp_p_scalar_628, i32 2 +; AUTO-VECTORIZATION-NEXT: %170 = insertelement <4 x double> %169, double %tmp_p_scalar_628, i32 3 +; AUTO-VECTORIZATION-NEXT: %171 = fmul <4 x double> %170, %148 +; AUTO-VECTORIZATION-NEXT: %172 = fadd <4 x double> %125, %171 +; AUTO-VECTORIZATION-NEXT: %173 = fmul <4 x double> %170, %158 +; AUTO-VECTORIZATION-NEXT: %174 = fadd <4 x double> %126, %173 +; AUTO-VECTORIZATION-NEXT: %polly.indvar_next169 = add nsw i64 %polly.indvar168, 1 +; AUTO-VECTORIZATION-NEXT: %polly.loop_cond171 = icmp sle i64 %polly.indvar168, %polly.adjust_ub170 +; AUTO-VECTORIZATION-NEXT: br i1 %polly.loop_cond171, label %polly.loop_header164, label %polly.loop_exit166.loopexit + +; AUTO-VECTORIZATION:polly.loop_preheader165: ; preds = %polly.loop_header155 +; AUTO-VECTORIZATION-NEXT: %175 = mul nsw i64 4, %polly.indvar159 +; AUTO-VECTORIZATION-NEXT: %176 = add nsw i64 %.pre-phi, %175 +; AUTO-VECTORIZATION-NEXT: %polly.access.mul.Packed_A173 = mul nsw i64 %polly.indvar159, 256 +; AUTO-VECTORIZATION-NEXT: %scevgep = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %111 +; AUTO-VECTORIZATION-NEXT: %scevgep200 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %112 +; AUTO-VECTORIZATION-NEXT: %scevgep219 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %113 +; AUTO-VECTORIZATION-NEXT: %scevgep238 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %114 +; AUTO-VECTORIZATION-NEXT: %scevgep257 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %115 +; AUTO-VECTORIZATION-NEXT: %scevgep276 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %116 +; AUTO-VECTORIZATION-NEXT: %scevgep295 = getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %117 +; AUTO-VECTORIZATION-NEXT: %scevgep314 = 
getelementptr [1024 x double], [1024 x double]* %C, i64 %176, i64 %118 +; AUTO-VECTORIZATION-NEXT: %177 = add nsw i64 %176, 1 +; AUTO-VECTORIZATION-NEXT: %scevgep333 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %111 +; AUTO-VECTORIZATION-NEXT: %scevgep352 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %112 +; AUTO-VECTORIZATION-NEXT: %scevgep371 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %113 +; AUTO-VECTORIZATION-NEXT: %scevgep390 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %114 +; AUTO-VECTORIZATION-NEXT: %scevgep409 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %115 +; AUTO-VECTORIZATION-NEXT: %scevgep428 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %116 +; AUTO-VECTORIZATION-NEXT: %scevgep447 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %117 +; AUTO-VECTORIZATION-NEXT: %scevgep466 = getelementptr [1024 x double], [1024 x double]* %C, i64 %177, i64 %118 +; AUTO-VECTORIZATION-NEXT: %178 = add nsw i64 %176, 2 +; AUTO-VECTORIZATION-NEXT: %scevgep485 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %111 +; AUTO-VECTORIZATION-NEXT: %scevgep504 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %112 +; AUTO-VECTORIZATION-NEXT: %scevgep523 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %113 +; AUTO-VECTORIZATION-NEXT: %scevgep542 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %114 +; AUTO-VECTORIZATION-NEXT: %scevgep561 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %115 +; AUTO-VECTORIZATION-NEXT: %scevgep580 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %116 +; AUTO-VECTORIZATION-NEXT: %scevgep599 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %117 +; AUTO-VECTORIZATION-NEXT: %scevgep618 = getelementptr [1024 x double], [1024 x double]* %C, i64 %178, i64 %118 +; 
AUTO-VECTORIZATION-NEXT: %179 = add nsw i64 %176, 3 +; AUTO-VECTORIZATION-NEXT: %scevgep637 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %111 +; AUTO-VECTORIZATION-NEXT: %scevgep656 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %112 +; AUTO-VECTORIZATION-NEXT: %scevgep675 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %113 +; AUTO-VECTORIZATION-NEXT: %scevgep694 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %114 +; AUTO-VECTORIZATION-NEXT: %scevgep713 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %115 +; AUTO-VECTORIZATION-NEXT: %scevgep732 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %116 +; AUTO-VECTORIZATION-NEXT: %scevgep751 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %117 +; AUTO-VECTORIZATION-NEXT: %scevgep770 = getelementptr [1024 x double], [1024 x double]* %C, i64 %179, i64 %118 +; AUTO-VECTORIZATION-NEXT: %180 = bitcast double* %scevgep to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %181 = load <4 x double>, <4 x double>* %180, align 8 +; AUTO-VECTORIZATION-NEXT: %182 = bitcast double* %scevgep257 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %183 = load <4 x double>, <4 x double>* %182, align 8 +; AUTO-VECTORIZATION-NEXT: %184 = bitcast double* %scevgep333 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %185 = load <4 x double>, <4 x double>* %184, align 8 +; AUTO-VECTORIZATION-NEXT: %186 = bitcast double* %scevgep409 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %187 = load <4 x double>, <4 x double>* %186, align 8 +; AUTO-VECTORIZATION-NEXT: %188 = bitcast double* %scevgep485 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %189 = load <4 x double>, <4 x double>* %188, align 8 +; AUTO-VECTORIZATION-NEXT: %190 = bitcast double* %scevgep561 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %191 = load <4 x double>, <4 x double>* %190, align 8 +; AUTO-VECTORIZATION-NEXT: %192 = bitcast double* %scevgep637 to <4 x 
double>* +; AUTO-VECTORIZATION-NEXT: %193 = load <4 x double>, <4 x double>* %192, align 8 +; AUTO-VECTORIZATION-NEXT: %194 = bitcast double* %scevgep713 to <4 x double>* +; AUTO-VECTORIZATION-NEXT: %195 = load <4 x double>, <4 x double>* %194, align 8 +; AUTO-VECTORIZATION-NEXT: br label %polly.loop_header164 +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +define internal void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) #0 { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %cmp39 = icmp sgt i32 %ni, 0 + br i1 %cmp39, label %for.cond1.preheader.lr.ph, label %for.end22 + +for.cond1.preheader.lr.ph: ; preds = %entry.split + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc20, %for.cond1.preheader.lr.ph + %indvars.iv45 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next46, %for.inc20 ] + %cmp237 = icmp sgt i32 %nj, 0 + br i1 %cmp237, label %for.cond4.preheader.lr.ph, label %for.inc20 + +for.cond4.preheader.lr.ph: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc17, %for.cond4.preheader.lr.ph + %indvars.iv41 = phi i64 [ 0, %for.cond4.preheader.lr.ph ], [ %indvars.iv.next42, %for.inc17 ] + %cmp535 = icmp sgt i32 %nk, 0 + br i1 %cmp535, label %for.body6.lr.ph, label %for.inc17 + +for.body6.lr.ph: ; preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6, %for.body6.lr.ph + %indvars.iv = phi i64 [ 0, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ] + %arrayidx8 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i64 %indvars.iv45, i64 %indvars.iv + %tmp = load double, double* %arrayidx8, align 8 + %arrayidx12 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv41 + %tmp1 = load double, double* %arrayidx12, align 8 + %mul = fmul 
double %tmp, %tmp1 + %arrayidx16 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv45, i64 %indvars.iv41 + %tmp2 = load double, double* %arrayidx16, align 8 + %add = fadd double %tmp2, %mul + store double %add, double* %arrayidx16, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %nk to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body6, label %for.cond4.for.inc17_crit_edge + +for.cond4.for.inc17_crit_edge: ; preds = %for.body6 + br label %for.inc17 + +for.inc17: ; preds = %for.cond4.for.inc17_crit_edge, %for.cond4.preheader + %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1 + %wide.trip.count43 = zext i32 %nj to i64 + %exitcond44 = icmp ne i64 %indvars.iv.next42, %wide.trip.count43 + br i1 %exitcond44, label %for.cond4.preheader, label %for.cond1.for.inc20_crit_edge + +for.cond1.for.inc20_crit_edge: ; preds = %for.inc17 + br label %for.inc20 + +for.inc20: ; preds = %for.cond1.for.inc20_crit_edge, %for.cond1.preheader + %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1 + %wide.trip.count47 = zext i32 %ni to i64 + %exitcond48 = icmp ne i64 %indvars.iv.next46, %wide.trip.count47 + br i1 %exitcond48, label %for.cond1.preheader, label %for.cond.for.end22_crit_edge + +for.cond.for.end22_crit_edge: ; preds = %for.inc20 + br label %for.end22 + +for.end22: ; preds = %for.cond.for.end22_crit_edge, %entry.split + ret void +} + +attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+aes,+avx,+cmov,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }