Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -160,6 +160,91 @@ cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory)); +static __isl_give isl_union_set * +getIsolateOptions(__isl_take isl_set *IsolateDomain) { + auto Dims = isl_set_dim(IsolateDomain, isl_dim_set); + auto IsolateRange = + isl_set_project_out(isl_set_copy(IsolateDomain), isl_dim_set, 0, Dims); + auto IsolateDomainSpace = isl_set_get_space(IsolateDomain); + auto IsolateRangeSpace = isl_set_get_space(IsolateRange); + auto IsolateSpace = isl_space_map_from_domain_and_range(IsolateDomainSpace, + IsolateRangeSpace); + auto IsolateRelation = isl_map_universe(IsolateSpace); + IsolateRelation = isl_map_intersect_domain(IsolateRelation, IsolateDomain); + IsolateRelation = isl_map_intersect_range(IsolateRelation, IsolateRange); + IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0, + isl_dim_in, Dims - 1, 1); + auto IsolateOption = isl_map_wrap(IsolateRelation); + auto Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", NULL); + return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id)); +} + +static __isl_give isl_union_set *getAtomicOptions(__isl_take isl_space *Space) { + auto AtomicOption = isl_set_universe(Space); + AtomicOption = isl_set_add_dims(AtomicOption, isl_dim_set, 1); + auto Id = isl_id_alloc(isl_set_get_ctx(AtomicOption), "atomic", NULL); + return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id)); +} + +/// @brief Build the desired set of prefixes. +/// +/// 1. Get all prefixes of the vector loop. +/// 2. Extend it to a set that has exactly VectorWidth iterations for +/// every prefix from the previous step. +/// 3. Subtract the loop domain from it, project out the vector loop dimension +/// and get a set of prefixes that do not have exactly VectorWidth iterations. +/// 4. 
Subtract it from all prefixes of the vector loop and get the desired +/// set. +/// +/// @param ScheduleRange The range of a map that describes a prefix schedule +/// relation. + +static __isl_give isl_set *getPrefixes(__isl_take isl_set *ScheduleRange) { + auto Dims = isl_set_dim(ScheduleRange, isl_dim_set); + auto LoopPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange), + isl_dim_set, Dims - 1, 1); + auto ExtentConstraints = isl_set_project_out(isl_set_copy(ScheduleRange), + isl_dim_set, 0, Dims - 1); + auto DimsParam = isl_set_dim(ExtentConstraints, isl_dim_param); + ExtentConstraints = isl_set_drop_constraints_involving_dims( + ExtentConstraints, isl_dim_param, 0, DimsParam); + ExtentConstraints = + isl_set_insert_dims(ExtentConstraints, isl_dim_set, 0, Dims - 1); + auto ExtentPrefixes = + isl_set_add_dims(isl_set_copy(LoopPrefixes), isl_dim_set, 1); + ExtentPrefixes = isl_set_intersect(ExtentPrefixes, ExtentConstraints); + auto BadPrefixes = isl_set_subtract(ExtentPrefixes, ScheduleRange); + BadPrefixes = isl_set_project_out(BadPrefixes, isl_dim_set, Dims - 1, 1); + return isl_set_subtract(LoopPrefixes, BadPrefixes); +} + +/// @brief Isolate a set of prefixes. +/// +/// This set should ensure that any prefix from the set has exactly +/// VectorWidth iterations. +/// +/// @param Node A schedule node band, which is a parent of a band node +/// that contains a vector loop. 
+ +static __isl_give isl_schedule_node * +isolateFullPartialTiles(__isl_take isl_schedule_node *Node) { + if (isl_schedule_node_get_type(Node) != isl_schedule_node_band) + return Node; + Node = isl_schedule_node_child(Node, 0); + Node = isl_schedule_node_child(Node, 0); + auto SchedRelUMap = isl_schedule_node_get_prefix_schedule_relation(Node); + auto ScheduleRelation = isl_map_from_union_map(SchedRelUMap); + auto ScheduleRange = isl_map_range(ScheduleRelation); + auto IsolateDomain = getPrefixes(ScheduleRange); + auto AtomicOption = getAtomicOptions(isl_set_get_space(IsolateDomain)); + auto IsolateOption = getIsolateOptions(IsolateDomain); + Node = isl_schedule_node_parent(Node); + Node = isl_schedule_node_parent(Node); + auto Options = isl_union_set_union(IsolateOption, AtomicOption); + Node = isl_schedule_node_band_set_ast_build_options(Node, Options); + return Node; +} + __isl_give isl_schedule_node * ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node, unsigned DimToVectorize, @@ -183,6 +268,7 @@ Sizes = isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth)); Node = isl_schedule_node_band_tile(Node, Sizes); + Node = isolateFullPartialTiles(Node); Node = isl_schedule_node_child(Node, 0); // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise, // we will have troubles to match it in the backend. 
Index: test/ScheduleOptimizer/full_partial_tile_separation.ll =================================================================== --- /dev/null +++ test/ScheduleOptimizer/full_partial_tile_separation.ll @@ -0,0 +1,92 @@ +; RUN: opt -S %loadPolly -polly-vectorizer=stripmine -polly-opt-isl -polly-ast -analyze < %s | FileCheck %s + +; CHECK: // 1st level tiling - Tiles +; CHECK: #pragma known-parallel +; CHECK: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1) +; CHECK: for (int c1 = 0; c1 <= floord(nj - 1, 32); c1 += 1) +; CHECK: for (int c2 = 0; c2 <= floord(nk - 1, 32); c2 += 1) { +; CHECK: // 1st level tiling - Points +; CHECK: for (int c3 = 0; c3 <= min(31, ni - 32 * c0 - 1); c3 += 1) { +; CHECK: for (int c4 = 0; c4 <= min(7, -8 * c1 + nj / 4 - 1); c4 += 1) +; CHECK: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) +; CHECK: #pragma simd +; CHECK: for (int c6 = 0; c6 <= 3; c6 += 1) +; CHECK: Stmt_for_body_6(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5); +; CHECK: if (nj >= 32 * c1 + 4 && 32 * c1 + 31 >= nj) { +; CHECK: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) +; CHECK: #pragma simd +; CHECK: for (int c6 = 0; c6 < nj % 4; c6 += 1) +; CHECK: Stmt_for_body_6(32 * c0 + c3, -((nj - 1) % 4) + nj + c6 - 1, 32 * c2 + c5); +; CHECK: } else if (32 * c1 + 3 >= nj) +; CHECK: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) +; CHECK: #pragma simd +; CHECK: for (int c6 = 0; c6 < nj - 32 * c1; c6 += 1) +; CHECK: Stmt_for_body_6(32 * c0 + c3, 32 * c1 + c6, 32 * c2 + c5); +; CHECK: } +; CHECK: } + +; Function Attrs: nounwind uwtable +define void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) #0 { +entry: + %cmp.27 = icmp sgt i32 %ni, 0 + br i1 %cmp.27, label %for.cond.1.preheader.lr.ph, label %for.end.22 + +for.cond.1.preheader.lr.ph: ; preds = %entry + br label %for.cond.1.preheader + +for.cond.1.preheader: ; preds = %for.cond.1.preheader.lr.ph, 
%for.inc.20 + %indvars.iv33 = phi i64 [ 0, %for.cond.1.preheader.lr.ph ], [ %indvars.iv.next34, %for.inc.20 ] + %cmp2.25 = icmp sgt i32 %nj, 0 + br i1 %cmp2.25, label %for.cond.4.preheader.lr.ph, label %for.inc.20 + +for.cond.4.preheader.lr.ph: ; preds = %for.cond.1.preheader + br label %for.cond.4.preheader + +for.cond.4.preheader: ; preds = %for.cond.4.preheader.lr.ph, %for.inc.17 + %indvars.iv29 = phi i64 [ 0, %for.cond.4.preheader.lr.ph ], [ %indvars.iv.next30, %for.inc.17 ] + %cmp5.23 = icmp sgt i32 %nk, 0 + br i1 %cmp5.23, label %for.body.6.lr.ph, label %for.inc.17 + +for.body.6.lr.ph: ; preds = %for.cond.4.preheader + br label %for.body.6 + +for.body.6: ; preds = %for.body.6.lr.ph, %for.body.6 + %indvars.iv = phi i64 [ 0, %for.body.6.lr.ph ], [ %indvars.iv.next, %for.body.6 ] + %arrayidx8 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i64 %indvars.iv33, i64 %indvars.iv + %0 = load double, double* %arrayidx8, align 8 + %arrayidx12 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv29 + %1 = load double, double* %arrayidx12, align 8 + %mul = fmul double %0, %1 + %arrayidx16 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv33, i64 %indvars.iv29 + %2 = load double, double* %arrayidx16, align 8 + %add = fadd double %2, %mul + store double %add, double* %arrayidx16, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %nk + br i1 %exitcond, label %for.body.6, label %for.cond.4.for.inc.17_crit_edge + +for.cond.4.for.inc.17_crit_edge: ; preds = %for.body.6 + br label %for.inc.17 + +for.inc.17: ; preds = %for.cond.4.for.inc.17_crit_edge, %for.cond.4.preheader + %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1 + %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32 + %exitcond32 = icmp ne i32 %lftr.wideiv31, %nj + br i1 %exitcond32, label %for.cond.4.preheader, label 
%for.cond.1.for.inc.20_crit_edge + +for.cond.1.for.inc.20_crit_edge: ; preds = %for.inc.17 + br label %for.inc.20 + +for.inc.20: ; preds = %for.cond.1.for.inc.20_crit_edge, %for.cond.1.preheader + %indvars.iv.next34 = add nuw nsw i64 %indvars.iv33, 1 + %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32 + %exitcond36 = icmp ne i32 %lftr.wideiv35, %ni + br i1 %exitcond36, label %for.cond.1.preheader, label %for.cond.for.end.22_crit_edge + +for.cond.for.end.22_crit_edge: ; preds = %for.inc.20 + br label %for.end.22 + +for.end.22: ; preds = %for.cond.for.end.22_crit_edge, %entry + ret void +}