Index: lib/Transform/ScheduleOptimizer.cpp
===================================================================
--- lib/Transform/ScheduleOptimizer.cpp
+++ lib/Transform/ScheduleOptimizer.cpp
@@ -37,6 +37,9 @@
 
 #define DEBUG_TYPE "polly-opt-isl"
 
+STATISTIC(NumFusedLoops,  "Number of fused loops");
+STATISTIC(NumLoopFusions, "Number of loop fusions");
+
 namespace polly {
 bool DisablePollyTiling;
 }
@@ -194,6 +197,18 @@
 
   static isl_union_map *getScheduleMap(isl_schedule *Schedule);
 
+  /// @brief Report loop fusion statistics.
+  void reportFusedLoops(Scop &S, unsigned Dims);
+
+  /// @brief Determine if two statements are scheduled together.
+  bool doStmtsShareScatteringDim(ScopStmt *Stmt1, ScopStmt *Stmt2, unsigned D);
+
+  /// @brief Determine if a scattering dimension involves input dimensions.
+  bool doesScatteringInvolveInputDims(ScopStmt *Stmt, unsigned D);
+
+  /// @brief Get the outermost loop dimension for a given schedule dimension.
+  unsigned getOutermostLoopDim(ScopStmt *Stmt, unsigned D);
+
   using llvm::Pass::doFinalization;
 
   virtual bool doFinalization() {
@@ -585,9 +600,146 @@
     MaxScatDims = std::max(Stmt->getNumScattering(), MaxScatDims);
 
   extendScattering(S, MaxScatDims);
+
+  if (AreStatisticsEnabled() && FusionStrategy == "max")
+    reportFusedLoops(S, MaxScatDims);
+
   return false;
 }
 
+bool IslScheduleOptimizer::doesScatteringInvolveInputDims(ScopStmt *Stmt,
+    unsigned D) {
+
+  isl_pw_multi_aff *ScatPMA;
+  isl_pw_aff *ScatPA;
+
+  ScatPMA = isl_pw_multi_aff_from_map(Stmt->getScattering());
+  ScatPMA = isl_pw_multi_aff_gist(ScatPMA, Stmt->getDomain());
+  ScatPA  = isl_pw_multi_aff_get_pw_aff(ScatPMA, D);
+
+  bool Iterable = !isl_pw_aff_is_cst(ScatPA);
+
+  isl_pw_multi_aff_free(ScatPMA);
+  isl_pw_aff_free(ScatPA);
+  return Iterable;
+}
+
+unsigned IslScheduleOptimizer::getOutermostLoopDim(ScopStmt *Stmt, unsigned D) {
+
+  isl_pw_multi_aff *ScatPMA;
+  isl_pw_aff *ScatPA;
+
+  ScatPMA = isl_pw_multi_aff_from_map(Stmt->getScattering());
+  ScatPMA = isl_pw_multi_aff_gist(ScatPMA, Stmt->getDomain());
+  ScatPA  = isl_pw_multi_aff_get_pw_aff(ScatPMA, D);
+
+  unsigned Dimension = -1;
+
+  for (unsigned i = 0, e = Stmt->getNumIterators(); i != e; ++i)
+    if (isl_pw_aff_involves_dims(ScatPA, isl_dim_in, i, 1))
+      if (Stmt->getLoopForDimension(i)) {
+        Dimension = i;
+        break;
+      }
+
+  isl_pw_multi_aff_free(ScatPMA);
+  isl_pw_aff_free(ScatPA);
+  return Dimension;
+}
+
+bool IslScheduleOptimizer::doStmtsShareScatteringDim(ScopStmt *Stmt1, ScopStmt
+    *Stmt2, unsigned D) {
+
+  isl_set *S1Sched;
+  isl_set *S2Sched;
+
+  S1Sched = isl_map_range(Stmt1->getScattering());
+  S2Sched = isl_map_range(Stmt2->getScattering());
+
+  unsigned Dims = isl_set_dim(S1Sched, isl_dim_set);
+  assert(isl_set_dim(S2Sched, isl_dim_set) == Dims && "Schedule mismatch");
+
+  S1Sched = isl_set_project_out(S1Sched, isl_dim_set, D + 1, Dims - D - 1);
+  S2Sched = isl_set_project_out(S2Sched, isl_dim_set, D + 1, Dims - D - 1);
+
+  bool Fused = isl_set_plain_is_equal(S1Sched, S2Sched);
+
+  isl_set_free(S1Sched);
+  isl_set_free(S2Sched);
+  return Fused;
+}
+
+void IslScheduleOptimizer::reportFusedLoops(Scop &S, unsigned MaxScatDims) {
+
+  // Collect all the SCoP statements that may be fused.
+  SmallVector<ScopStmt *, 8> Stmts;
+  for (ScopStmt *Stmt : S)
+    Stmts.push_back(Stmt);
+
+  // If we only have one statement, there is no fusion.
+  if (Stmts.size() < 2)
+    return;
+
+  DEBUG(dbgs() << "\n");
+
+  // Keep track of all the loops that are fused in FusedLoops.
+  SmallPtrSet<const Loop *, 4> FusedLoops;
+
+  // Iterate over the scattering dimensions. All statements should have the
+  // same number of scattering dimensions.
+  SmallVectorImpl<ScopStmt *>::iterator Begin = Stmts.begin(), E = Stmts.end();
+  for (unsigned D = 0; D < MaxScatDims; ++D) {
+
+    // Ignore auxiliary scattering dimensions that do not involve the input
+    // dimensions. Constant dimensions are only used for statement ordering.
+    if (!doesScatteringInvolveInputDims(*Begin, D))
+      continue;
+
+    // Iterate over all statements I and perform comparisons for fusion.
+    for (SmallVectorImpl<ScopStmt *>::iterator I = Begin; I != E - 1; ++I) {
+
+      // Get the original loop corresponding to statement I. Note that this is
+      // not an exact mapping due to transformations that create additional
+      // scattering dimensions such as tiling. To work around this, we get the
+      // outermost original loop the current scattering dimension involves.
+      unsigned LoopDimension = getOutermostLoopDim(*I, D);
+      assert(LoopDimension >= 0 && "Expected positive loop dimension");
+
+      const Loop *LoopI = (*I)->getLoopForDimension(LoopDimension);
+
+      // If LoopI is in FusedLoops, it has already been counted as fused, so
+      // there is no need to consider it again.
+      if (FusedLoops.count(LoopI))
+        continue;
+
+      // Keep track of all loops fused with LoopI in LoopsJFusedWithI.
+      SmallPtrSet<const Loop *, 4> LoopsJFusedWithI;
+      LoopsJFusedWithI.insert(LoopI);
+
+      // Determine if LoopI is fused with any other other in the SCoP. Two
+      // loops are fused if they are not equal and their corresponding
+      // statements share the same scattering dimension.
+      for (SmallVectorImpl<ScopStmt *>::iterator J = I + 1; J != E; ++J)
+        if (doStmtsShareScatteringDim(*I, *J, D))
+          LoopsJFusedWithI.insert((*J)->getLoopForDimension(LoopDimension));
+
+      // If LoopsJFusedWithI contains more than one loop, they are all fused in
+      // the current LoopDimension. Record loop fusion statistics.
+      if (LoopsJFusedWithI.size() > 1) {
+        ++NumLoopFusions;
+        NumFusedLoops += LoopsJFusedWithI.size();
+        DEBUG(dbgs() << "Loop fusion " << NumLoopFusions << ": "
+                     << LoopsJFusedWithI.size() << " loops fused at depth "
+                     << LoopDimension + 1 << "\n");
+
+        // Save the fused loops to avoid extra computation.
+        FusedLoops.insert(LoopsJFusedWithI.begin(), LoopsJFusedWithI.end());
+      }
+    }
+  }
+  DEBUG(dbgs() << "\n");
+}
+
 void IslScheduleOptimizer::printScop(raw_ostream &OS) const {
   isl_printer *p;
   char *ScheduleStr;
Index: test/ScheduleOptimizer/loop-fusion-stats-01.ll
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/loop-fusion-stats-01.ll
@@ -0,0 +1,76 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-opt-fusion=max -debug-only=polly-opt-isl -stats -analyze < %s 2>&1 | FileCheck %s
+;
+; // Original code
+; int A[100];
+; int fusion(int c, int d, int n) {
+;   for (int j = 0; j < n; j++)
+;     A[j] ^= c;
+;   for (int i = 0; i < n; i++)
+;     A[i] |= d;
+;   return 0;
+; }
+;
+; // AST of fused loops
+; for (int c0 = 0; c0 < n; c0 += 1) {
+;   Stmt_for_body1(c0);
+;   Stmt_for_body2(c0);
+; }
+;
+; // Loop fusion debugging info
+; CHECK: Loop fusion 1: 2 loops fused at depth 1
+;
+; // Loop fusion statistics
+; CHECK: 2 polly-opt-isl {{.*}} Number of fused loops
+; CHECK: 1 polly-opt-isl {{.*}} Number of loop fusions
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+@A = common global [100 x i32] zeroinitializer, align 4
+
+define i32 @fusion(i32 %c, i32 %d, i32 %n) #0 {
+entry:
+  br label %entry.split
+
+entry.split:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body.lr.ph, label %for.cond1.preheader
+
+for.body.lr.ph:
+  br label %for.body
+
+for.cond.for.cond1.preheader_crit_edge:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %cmp28 = icmp sgt i32 %n, 0
+  br i1 %cmp28, label %for.body3.lr.ph, label %for.end7
+
+for.body3.lr.ph:
+  br label %for.body3
+
+for.body:
+  %j.011 = phi i32 [ 0, %for.body.lr.ph ], [ %1, %for.body ]
+  %arrayidx = getelementptr [100 x i32]* @A, i32 0, i32 %j.011
+  %0 = load i32* %arrayidx, align 4
+  %xor = xor i32 %0, %c
+  store i32 %xor, i32* %arrayidx, align 4
+  %1 = add nsw i32 %j.011, 1
+  %exitcond12 = icmp ne i32 %1, %n
+  br i1 %exitcond12, label %for.body, label %for.cond.for.cond1.preheader_crit_edge
+
+for.body3:
+  %i.09 = phi i32 [ 0, %for.body3.lr.ph ], [ %3, %for.body3 ]
+  %arrayidx4 = getelementptr [100 x i32]* @A, i32 0, i32 %i.09
+  %2 = load i32* %arrayidx4, align 4
+  %or = or i32 %2, %d
+  store i32 %or, i32* %arrayidx4, align 4
+  %3 = add nsw i32 %i.09, 1
+  %exitcond = icmp ne i32 %3, %n
+  br i1 %exitcond, label %for.body3, label %for.cond1.for.end7_crit_edge
+
+for.cond1.for.end7_crit_edge:
+  br label %for.end7
+
+for.end7:
+  ret i32 0
+}
Index: test/ScheduleOptimizer/loop-fusion-stats-02.ll
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/loop-fusion-stats-02.ll
@@ -0,0 +1,97 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-opt-fusion=max -debug-only=polly-opt-isl -stats -analyze < %s 2>&1 | FileCheck %s
+;
+; // Original code
+; int A[100];
+; int fusion(int c, int d, int m, int n) {
+;   for (int k = 0; k < m; k++) {
+;     for (int j = 0; j < n; j++)
+;       A[k + j] ^= c;
+;     for (int i = 0; i < n; i++)
+;       A[k + i] |= d;
+;   }
+;   return 0;
+; }
+;
+; // AST of fused loops
+; for (int c0 = 0; c0 < m + n - 1; c0 += 1)
+;   for (int c1 = max(-n + 1, -c0); c1 <= min(0, m - c0 - 1); c1 += 1) {
+;     Stmt_for_body1(c0 + c1, -c1);
+;     Stmt_for_body2(c0 + c1, -c1);
+;   }
+;
+; // Loop fusion debugging info
+; CHECK: Loop fusion 1: 2 loops fused at depth 2
+;
+; // Loop fusion statistics
+; CHECK: 2 polly-opt-isl {{.*}} Number of fused loops
+; CHECK: 1 polly-opt-isl {{.*}} Number of loop fusions
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+@A = common global [100 x i32] zeroinitializer, align 4
+
+define i32 @fusion(i32 %c, i32 %d, i32 %m, i32 %n) #0 {
+entry:
+  br label %entry.split
+
+entry.split:
+  %cmp19 = icmp sgt i32 %m, 0
+  br i1 %cmp19, label %for.cond1.preheader.lr.ph, label %for.end14
+
+for.cond1.preheader.lr.ph:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %k.020 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %6, %for.inc12 ]
+  %cmp215 = icmp sgt i32 %n, 0
+  br i1 %cmp215, label %for.body3.lr.ph, label %for.cond4.preheader
+
+for.body3.lr.ph:
+  br label %for.body3
+
+for.cond1.for.cond4.preheader_crit_edge:
+  br label %for.cond4.preheader
+
+for.cond4.preheader:
+  %cmp517 = icmp sgt i32 %n, 0
+  br i1 %cmp517, label %for.body6.lr.ph, label %for.inc12
+
+for.body6.lr.ph:
+  br label %for.body6
+
+for.body3:
+  %j.016 = phi i32 [ 0, %for.body3.lr.ph ], [ %2, %for.body3 ]
+  %0 = add i32 %k.020, %j.016
+  %arrayidx = getelementptr [100 x i32]* @A, i32 0, i32 %0
+  %1 = load i32* %arrayidx, align 4
+  %xor = xor i32 %1, %c
+  store i32 %xor, i32* %arrayidx, align 4
+  %2 = add nsw i32 %j.016, 1
+  %exitcond = icmp ne i32 %2, %n
+  br i1 %exitcond, label %for.body3, label %for.cond1.for.cond4.preheader_crit_edge
+
+for.body6:
+  %i.018 = phi i32 [ 0, %for.body6.lr.ph ], [ %5, %for.body6 ]
+  %3 = add i32 %k.020, %i.018
+  %arrayidx8 = getelementptr [100 x i32]* @A, i32 0, i32 %3
+  %4 = load i32* %arrayidx8, align 4
+  %or = or i32 %4, %d
+  store i32 %or, i32* %arrayidx8, align 4
+  %5 = add nsw i32 %i.018, 1
+  %exitcond22 = icmp ne i32 %5, %n
+  br i1 %exitcond22, label %for.body6, label %for.cond4.for.inc12_crit_edge
+
+for.cond4.for.inc12_crit_edge:
+  br label %for.inc12
+
+for.inc12:
+  %6 = add nsw i32 %k.020, 1
+  %exitcond23 = icmp ne i32 %6, %m
+  br i1 %exitcond23, label %for.cond1.preheader, label %for.cond.for.end14_crit_edge
+
+for.cond.for.end14_crit_edge:
+  br label %for.end14
+
+for.end14:
+  ret i32 0
+}
Index: test/ScheduleOptimizer/loop-fusion-stats-03.ll
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/loop-fusion-stats-03.ll
@@ -0,0 +1,126 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-opt-fusion=max -debug-only=polly-opt-isl -stats -analyze < %s 2>&1 | FileCheck %s
+;
+; // Original code
+; int A[100], B[100];
+; int fusion(int c, int d, int n) {
+;   for (int j = 0; j < n; j++)
+;     A[j] ^= c;
+;   for (int i = 0; i < n; i++)
+;     A[i] |= d;
+;   for (int j = 0; j < n; j++)
+;     B[j] ^= c;
+;   for (int i = 0; i < n; i++)
+;     B[i] |= d;
+;   return 0;
+; }
+;
+; // AST of fused loops
+; for (int c1 = 0; c1 < n; c1 += 1) {
+;   Stmt_for_body1(c1);
+;   Stmt_for_body2(c1);
+; }
+; for (int c1 = 0; c1 < n; c1 += 1) {
+;   Stmt_for_body3(c1);
+;   Stmt_for_body4(c1);
+; }
+;
+; // Loop fusion debugging info
+; CHECK: Loop fusion 1: 2 loops fused at depth 1
+; CHECK: Loop fusion 2: 2 loops fused at depth 1
+;
+; // Loop fusion statistics
+; CHECK: 4 polly-opt-isl {{.*}} Number of fused loops
+; CHECK: 2 polly-opt-isl {{.*}} Number of loop fusions
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+@A = common global [100 x i32] zeroinitializer, align 4
+@B = common global [100 x i32] zeroinitializer, align 4
+
+define i32 @fusion(i32 %c, i32 %d, i32 %n) #0 {
+entry:
+  br label %entry.split
+
+entry.split:
+  %cmp32 = icmp sgt i32 %n, 0
+  br i1 %cmp32, label %for.body.lr.ph, label %for.cond1.preheader
+
+for.body.lr.ph:
+  br label %for.body
+
+for.cond.for.cond1.preheader_crit_edge:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %cmp230 = icmp sgt i32 %n, 0
+  br i1 %cmp230, label %for.body3.lr.ph, label %for.cond9.preheader
+
+for.body3.lr.ph:
+  br label %for.body3
+
+for.body:
+  %j.033 = phi i32 [ 0, %for.body.lr.ph ], [ %1, %for.body ]
+  %arrayidx = getelementptr [100 x i32]* @A, i32 0, i32 %j.033
+  %0 = load i32* %arrayidx, align 4
+  %xor = xor i32 %0, %c
+  store i32 %xor, i32* %arrayidx, align 4
+  %1 = add nsw i32 %j.033, 1
+  %exitcond36 = icmp ne i32 %1, %n
+  br i1 %exitcond36, label %for.body, label %for.cond.for.cond1.preheader_crit_edge
+
+for.cond1.for.cond9.preheader_crit_edge:
+  br label %for.cond9.preheader
+
+for.cond9.preheader:
+  %cmp1028 = icmp sgt i32 %n, 0
+  br i1 %cmp1028, label %for.body11.lr.ph, label %for.cond18.preheader
+
+for.body11.lr.ph:
+  br label %for.body11
+
+for.body3:
+  %i.031 = phi i32 [ 0, %for.body3.lr.ph ], [ %3, %for.body3 ]
+  %arrayidx4 = getelementptr [100 x i32]* @A, i32 0, i32 %i.031
+  %2 = load i32* %arrayidx4, align 4
+  %or = or i32 %2, %d
+  store i32 %or, i32* %arrayidx4, align 4
+  %3 = add nsw i32 %i.031, 1
+  %exitcond35 = icmp ne i32 %3, %n
+  br i1 %exitcond35, label %for.body3, label %for.cond1.for.cond9.preheader_crit_edge
+
+for.cond9.for.cond18.preheader_crit_edge:
+  br label %for.cond18.preheader
+
+for.cond18.preheader:
+  %cmp1926 = icmp sgt i32 %n, 0
+  br i1 %cmp1926, label %for.body20.lr.ph, label %for.end25
+
+for.body20.lr.ph:
+  br label %for.body20
+
+for.body11:
+  %j8.029 = phi i32 [ 0, %for.body11.lr.ph ], [ %5, %for.body11 ]
+  %arrayidx12 = getelementptr [100 x i32]* @B, i32 0, i32 %j8.029
+  %4 = load i32* %arrayidx12, align 4
+  %xor13 = xor i32 %4, %c
+  store i32 %xor13, i32* %arrayidx12, align 4
+  %5 = add nsw i32 %j8.029, 1
+  %exitcond34 = icmp ne i32 %5, %n
+  br i1 %exitcond34, label %for.body11, label %for.cond9.for.cond18.preheader_crit_edge
+
+for.body20:
+  %i17.027 = phi i32 [ 0, %for.body20.lr.ph ], [ %7, %for.body20 ]
+  %arrayidx21 = getelementptr [100 x i32]* @B, i32 0, i32 %i17.027
+  %6 = load i32* %arrayidx21, align 4
+  %or22 = or i32 %6, %d
+  store i32 %or22, i32* %arrayidx21, align 4
+  %7 = add nsw i32 %i17.027, 1
+  %exitcond = icmp ne i32 %7, %n
+  br i1 %exitcond, label %for.body20, label %for.cond18.for.end25_crit_edge
+
+for.cond18.for.end25_crit_edge:
+  br label %for.end25
+
+for.end25:
+  ret i32 0
+}
Index: test/ScheduleOptimizer/loop-fusion-stats-04.ll
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/loop-fusion-stats-04.ll
@@ -0,0 +1,150 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-opt-fusion=max -debug-only=polly-opt-isl -stats -analyze < %s 2>&1 | FileCheck %s
+;
+; // Original code
+; int A[100], B[100];
+; int fusion(int c, int d, int m, int n) {
+;   for (int k = 0; k < m; k++) {
+;     for (int j = 0; j < n; j++)
+;       A[k + j] ^= c;
+;     for (int i = 0; i < n; i++)
+;       A[k + i] |= d;
+;     for (int j = 0; j < n; j++)
+;       B[k + j] ^= c;
+;     for (int i = 0; i < n; i++)
+;       B[k + i] |= d;
+;   }
+;   return 0;
+; }
+;
+; // AST of fused loops
+; for (int c1 = 0; c1 < m + n - 1; c1 += 1)
+;   for (int c2 = max(-n + 1, -c1); c2 <= min(0, m - c1 - 1); c2 += 1) {
+;     Stmt_for_body1(c1 + c2, -c2);
+;     Stmt_for_body2(c1 + c2, -c2);
+;   }
+; for (int c1 = 0; c1 < m + n - 1; c1 += 1)
+;   for (int c2 = max(-n + 1, -c1); c2 <= min(0, m - c1 - 1); c2 += 1) {
+;     Stmt_for_body3(c1 + c2, -c2);
+;     Stmt_for_body4(c1 + c2, -c2);
+;   }
+;
+; // Loop fusion debugging info
+; CHECK: Loop fusion 1: 2 loops fused at depth 2
+; CHECK: Loop fusion 2: 2 loops fused at depth 2
+;
+; // Loop fusion statistics
+; CHECK: 4 polly-opt-isl {{.*}} Number of fused loops
+; CHECK: 2 polly-opt-isl {{.*}} Number of loop fusions
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+@A = common global [100 x i32] zeroinitializer, align 4
+@B = common global [100 x i32] zeroinitializer, align 4
+
+define i32 @fusion(i32 %c, i32 %d, i32 %m, i32 %n) #0 {
+entry:
+  br label %entry.split
+
+entry.split:
+  %cmp43 = icmp sgt i32 %m, 0
+  br i1 %cmp43, label %for.cond1.preheader.lr.ph, label %for.end34
+
+for.cond1.preheader.lr.ph:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %k.044 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %12, %for.inc32 ]
+  %cmp235 = icmp sgt i32 %n, 0
+  br i1 %cmp235, label %for.body3.lr.ph, label %for.cond4.preheader
+
+for.body3.lr.ph:
+  br label %for.body3
+
+for.cond1.for.cond4.preheader_crit_edge:
+  br label %for.cond4.preheader
+
+for.cond4.preheader:
+  %cmp537 = icmp sgt i32 %n, 0
+  br i1 %cmp537, label %for.body6.lr.ph, label %for.cond13.preheader
+
+for.body6.lr.ph:
+  br label %for.body6
+
+for.body3:
+  %j.036 = phi i32 [ 0, %for.body3.lr.ph ], [ %2, %for.body3 ]
+  %0 = add i32 %k.044, %j.036
+  %arrayidx = getelementptr [100 x i32]* @A, i32 0, i32 %0
+  %1 = load i32* %arrayidx, align 4
+  %xor = xor i32 %1, %c
+  store i32 %xor, i32* %arrayidx, align 4
+  %2 = add nsw i32 %j.036, 1
+  %exitcond = icmp ne i32 %2, %n
+  br i1 %exitcond, label %for.body3, label %for.cond1.for.cond4.preheader_crit_edge
+
+for.cond4.for.cond13.preheader_crit_edge:
+  br label %for.cond13.preheader
+
+for.cond13.preheader:
+  %cmp1439 = icmp sgt i32 %n, 0
+  br i1 %cmp1439, label %for.body15.lr.ph, label %for.cond23.preheader
+
+for.body15.lr.ph:
+  br label %for.body15
+
+for.body6:
+  %i.038 = phi i32 [ 0, %for.body6.lr.ph ], [ %5, %for.body6 ]
+  %3 = add i32 %k.044, %i.038
+  %arrayidx8 = getelementptr [100 x i32]* @A, i32 0, i32 %3
+  %4 = load i32* %arrayidx8, align 4
+  %or = or i32 %4, %d
+  store i32 %or, i32* %arrayidx8, align 4
+  %5 = add nsw i32 %i.038, 1
+  %exitcond46 = icmp ne i32 %5, %n
+  br i1 %exitcond46, label %for.body6, label %for.cond4.for.cond13.preheader_crit_edge
+
+for.cond13.for.cond23.preheader_crit_edge:
+  br label %for.cond23.preheader
+
+for.cond23.preheader:
+  %cmp2441 = icmp sgt i32 %n, 0
+  br i1 %cmp2441, label %for.body25.lr.ph, label %for.inc32
+
+for.body25.lr.ph:
+  br label %for.body25
+
+for.body15:
+  %j12.040 = phi i32 [ 0, %for.body15.lr.ph ], [ %8, %for.body15 ]
+  %6 = add i32 %k.044, %j12.040
+  %arrayidx17 = getelementptr [100 x i32]* @B, i32 0, i32 %6
+  %7 = load i32* %arrayidx17, align 4
+  %xor18 = xor i32 %7, %c
+  store i32 %xor18, i32* %arrayidx17, align 4
+  %8 = add nsw i32 %j12.040, 1
+  %exitcond47 = icmp ne i32 %8, %n
+  br i1 %exitcond47, label %for.body15, label %for.cond13.for.cond23.preheader_crit_edge
+
+for.body25:
+  %i22.042 = phi i32 [ 0, %for.body25.lr.ph ], [ %11, %for.body25 ]
+  %9 = add i32 %k.044, %i22.042
+  %arrayidx27 = getelementptr [100 x i32]* @B, i32 0, i32 %9
+  %10 = load i32* %arrayidx27, align 4
+  %or28 = or i32 %10, %d
+  store i32 %or28, i32* %arrayidx27, align 4
+  %11 = add nsw i32 %i22.042, 1
+  %exitcond48 = icmp ne i32 %11, %n
+  br i1 %exitcond48, label %for.body25, label %for.cond23.for.inc32_crit_edge
+
+for.cond23.for.inc32_crit_edge:
+  br label %for.inc32
+
+for.inc32:
+  %12 = add nsw i32 %k.044, 1
+  %exitcond49 = icmp ne i32 %12, %m
+  br i1 %exitcond49, label %for.cond1.preheader, label %for.cond.for.end34_crit_edge
+
+for.cond.for.end34_crit_edge:
+  br label %for.end34
+
+for.end34:
+  ret i32 0
+}
Index: test/ScheduleOptimizer/loop-fusion-stats-05.ll
===================================================================
--- /dev/null
+++ test/ScheduleOptimizer/loop-fusion-stats-05.ll
@@ -0,0 +1,113 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-opt-fusion=max -debug-only=polly-opt-isl -stats -analyze < %s 2>&1 | FileCheck %s
+;
+; // Original code
+; int A[100], B[100];
+; int fusion(int c, int d, int m, int n) {
+;   for (int k = 0; k < m; k++)
+;     for (int j = 0; j < n; j++)
+;       A[j] ^= c;
+;   for (int k = 0; k < m; k++)
+;     for (int i = 0; i < n; i++)
+;       A[i] |= d;
+;   return 0;
+; }
+;
+; // AST of fused loops
+; for (int c0 = 0; c0 < n; c0 += 1) {
+;   for (int c1 = 0; c1 < m; c1 += 1)
+;     Stmt_for_body1(c1, c0);
+;   for (int c1 = m; c1 < 2 * m; c1 += 1)
+;     Stmt_for_body2(-m + c1, c0);
+; }
+;
+; // Loop fusion debugging info
+; CHECK: Loop fusion 1: 2 loops fused at depth 2
+;
+; // Loop fusion statistics
+; CHECK: 2 polly-opt-isl {{.*}} Number of fused loops
+; CHECK: 1 polly-opt-isl {{.*}} Number of loop fusions
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+@A = common global [100 x i32] zeroinitializer, align 4
+@B = common global [100 x i32] zeroinitializer, align 4
+
+define i32 @fusion(i32 %c, i32 %d, i32 %m, i32 %n) #0 {
+entry:
+  br label %entry.split
+
+entry.split:
+  %cmp27 = icmp sgt i32 %m, 0
+  br i1 %cmp27, label %for.cond1.preheader.lr.ph, label %for.cond8.preheader
+
+for.cond1.preheader.lr.ph:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %k.028 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %2, %for.inc4 ]
+  %cmp225 = icmp sgt i32 %n, 0
+  br i1 %cmp225, label %for.body3.lr.ph, label %for.inc4
+
+for.body3.lr.ph:
+  br label %for.body3
+
+for.cond.for.cond8.preheader_crit_edge:
+  br label %for.cond8.preheader
+
+for.cond8.preheader:
+  %cmp923 = icmp sgt i32 %m, 0
+  br i1 %cmp923, label %for.cond11.preheader.lr.ph, label %for.end20
+
+for.cond11.preheader.lr.ph:
+  br label %for.cond11.preheader
+
+for.body3:
+  %j.026 = phi i32 [ 0, %for.body3.lr.ph ], [ %1, %for.body3 ]
+  %arrayidx = getelementptr [100 x i32]* @A, i32 0, i32 %j.026
+  %0 = load i32* %arrayidx, align 4
+  %xor = xor i32 %0, %c
+  store i32 %xor, i32* %arrayidx, align 4
+  %1 = add nsw i32 %j.026, 1
+  %exitcond30 = icmp ne i32 %1, %n
+  br i1 %exitcond30, label %for.body3, label %for.cond1.for.inc4_crit_edge
+
+for.cond1.for.inc4_crit_edge:
+  br label %for.inc4
+
+for.inc4:
+  %2 = add nsw i32 %k.028, 1
+  %exitcond31 = icmp ne i32 %2, %m
+  br i1 %exitcond31, label %for.cond1.preheader, label %for.cond.for.cond8.preheader_crit_edge
+
+for.cond11.preheader:
+  %k7.024 = phi i32 [ 0, %for.cond11.preheader.lr.ph ], [ %5, %for.inc18 ]
+  %cmp1221 = icmp sgt i32 %n, 0
+  br i1 %cmp1221, label %for.body13.lr.ph, label %for.inc18
+
+for.body13.lr.ph:
+  br label %for.body13
+
+for.body13:
+  %i.022 = phi i32 [ 0, %for.body13.lr.ph ], [ %4, %for.body13 ]
+  %arrayidx14 = getelementptr [100 x i32]* @A, i32 0, i32 %i.022
+  %3 = load i32* %arrayidx14, align 4
+  %or = or i32 %3, %d
+  store i32 %or, i32* %arrayidx14, align 4
+  %4 = add nsw i32 %i.022, 1
+  %exitcond = icmp ne i32 %4, %n
+  br i1 %exitcond, label %for.body13, label %for.cond11.for.inc18_crit_edge
+
+for.cond11.for.inc18_crit_edge:
+  br label %for.inc18
+
+for.inc18:
+  %5 = add nsw i32 %k7.024, 1
+  %exitcond29 = icmp ne i32 %5, %m
+  br i1 %exitcond29, label %for.cond11.preheader, label %for.cond8.for.end20_crit_edge
+
+for.cond8.for.end20_crit_edge:
+  br label %for.end20
+
+for.end20:
+  ret i32 0
+}