Index: lib/Transform/ScheduleOptimizer.cpp =================================================================== --- lib/Transform/ScheduleOptimizer.cpp +++ lib/Transform/ScheduleOptimizer.cpp @@ -37,6 +37,9 @@ #define DEBUG_TYPE "polly-opt-isl" +STATISTIC(NumFusedLoops, "Number of fused loops"); +STATISTIC(NumLoopFusions, "Number of loop fusions"); + namespace polly { bool DisablePollyTiling; } @@ -194,6 +197,18 @@ static isl_union_map *getScheduleMap(isl_schedule *Schedule); + /// @brief Report loop fusion statistics. + void reportFusedLoops(Scop &S, unsigned Dims); + + /// @brief Determine if two statements are scheduled together. + bool doStmtsShareScatteringDim(ScopStmt *Stmt1, ScopStmt *Stmt2, unsigned D); + + /// @brief Determine if a scattering dimension involves input dimensions. + bool doesScatteringInvolveInputDims(ScopStmt *Stmt, unsigned D); + + /// @brief Get the outermost loop dimension for a given schedule dimension. + unsigned getOutermostLoopDim(ScopStmt *Stmt, unsigned D); + using llvm::Pass::doFinalization; virtual bool doFinalization() { @@ -585,9 +600,146 @@ MaxScatDims = std::max(Stmt->getNumScattering(), MaxScatDims); extendScattering(S, MaxScatDims); + + if (AreStatisticsEnabled() && FusionStrategy == "max") + reportFusedLoops(S, MaxScatDims); + return false; } +bool IslScheduleOptimizer::doesScatteringInvolveInputDims(ScopStmt *Stmt, + unsigned D) { + + isl_pw_multi_aff *ScatPMA; + isl_pw_aff *ScatPA; + + ScatPMA = isl_pw_multi_aff_from_map(Stmt->getScattering()); + ScatPMA = isl_pw_multi_aff_gist(ScatPMA, Stmt->getDomain()); + ScatPA = isl_pw_multi_aff_get_pw_aff(ScatPMA, D); + + bool Iterable = !isl_pw_aff_is_cst(ScatPA); + + isl_pw_multi_aff_free(ScatPMA); + isl_pw_aff_free(ScatPA); + return Iterable; +} + +unsigned IslScheduleOptimizer::getOutermostLoopDim(ScopStmt *Stmt, unsigned D) { + + isl_pw_multi_aff *ScatPMA; + isl_pw_aff *ScatPA; + + ScatPMA = isl_pw_multi_aff_from_map(Stmt->getScattering()); + ScatPMA = isl_pw_multi_aff_gist(ScatPMA, Stmt->getDomain()); + ScatPA = isl_pw_multi_aff_get_pw_aff(ScatPMA, D); + + unsigned Dimension = -1; + + for (unsigned i = 0, e = Stmt->getNumIterators(); i != e; ++i) + if (isl_pw_aff_involves_dims(ScatPA, isl_dim_in, i, 1)) + if (Stmt->getLoopForDimension(i)) { + Dimension = i; + break; + } + + isl_pw_multi_aff_free(ScatPMA); + isl_pw_aff_free(ScatPA); + return Dimension; +} + +bool IslScheduleOptimizer::doStmtsShareScatteringDim(ScopStmt *Stmt1, ScopStmt + *Stmt2, unsigned D) { + + isl_set *S1Sched; + isl_set *S2Sched; + + S1Sched = isl_map_range(Stmt1->getScattering()); + S2Sched = isl_map_range(Stmt2->getScattering()); + + unsigned Dims = isl_set_dim(S1Sched, isl_dim_set); + assert(isl_set_dim(S2Sched, isl_dim_set) == Dims && "Schedule mismatch"); + + S1Sched = isl_set_project_out(S1Sched, isl_dim_set, D + 1, Dims - D - 1); + S2Sched = isl_set_project_out(S2Sched, isl_dim_set, D + 1, Dims - D - 1); + + bool Fused = isl_set_plain_is_equal(S1Sched, S2Sched); + + isl_set_free(S1Sched); + isl_set_free(S2Sched); + return Fused; +} + +void IslScheduleOptimizer::reportFusedLoops(Scop &S, unsigned MaxScatDims) { + + // Collect all the SCoP statements that may be fused. + SmallVector Stmts; + for (ScopStmt *Stmt : S) + Stmts.push_back(Stmt); + + // If we only have one statement, there is no fusion. + if (Stmts.size() < 2) + return; + + DEBUG(dbgs() << "\n"); + + // Keep track of all the loops that are fused in FusedLoops. + SmallPtrSet FusedLoops; + + // Iterate over the scattering dimensions. All statements should have the + // same number of scattering dimensions. + SmallVectorImpl::iterator Begin = Stmts.begin(), E = Stmts.end(); + for (unsigned D = 0; D < MaxScatDims; ++D) { + + // Ignore auxiliary scattering dimensions that do not involve the input + // dimensions. Constant dimensions are only used for statement ordering. + if (!doesScatteringInvolveInputDims(*Begin, D)) + continue; + + // Iterate over all statements I and perform comparisons for fusion. + for (SmallVectorImpl::iterator I = Begin; I != E - 1; ++I) { + + // Get the original loop corresponding to statement I. Note that this is + // not an exact mapping due to transformations that create additional + // scattering dimensions such as tiling. To work around this, we get the + // outermost original loop the current scattering dimension involves. + unsigned LoopDimension = getOutermostLoopDim(*I, D); + assert(LoopDimension >= 0 && "Expected positive loop dimension"); + + const Loop *LoopI = (*I)->getLoopForDimension(LoopDimension); + + // If LoopI is in FusedLoops, it has already been counted as fused, so + // there is no need to consider it again. + if (FusedLoops.count(LoopI)) + continue; + + // Keep track of all loops fused with LoopI in LoopsJFusedWithI. + SmallPtrSet LoopsJFusedWithI; + LoopsJFusedWithI.insert(LoopI); + + // Determine if LoopI is fused with any other other in the SCoP. Two + // loops are fused if they are not equal and their corresponding + // statements share the same scattering dimension. + for (SmallVectorImpl::iterator J = I + 1; J != E; ++J) + if (doStmtsShareScatteringDim(*I, *J, D)) + LoopsJFusedWithI.insert((*J)->getLoopForDimension(LoopDimension)); + + // If LoopsJFusedWithI contains more than one loop, they are all fused in + // the current LoopDimension. Record loop fusion statistics. + if (LoopsJFusedWithI.size() > 1) { + ++NumLoopFusions; + NumFusedLoops += LoopsJFusedWithI.size(); + DEBUG(dbgs() << "Loop fusion " << NumLoopFusions << ": " + << LoopsJFusedWithI.size() << " loops fused at depth " + << LoopDimension + 1 << "\n"); + + // Save the fused loops to avoid extra computation. + FusedLoops.insert(LoopsJFusedWithI.begin(), LoopsJFusedWithI.end()); + } + } + } + DEBUG(dbgs() << "\n"); +} + void IslScheduleOptimizer::printScop(raw_ostream &OS) const { isl_printer *p; char *ScheduleStr; Index: test/ScheduleOptimizer/loop-fusion-stats-01.ll =================================================================== --- /dev/null +++ test/ScheduleOptimizer/loop-fusion-stats-01.ll @@ -0,0 +1,76 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-opt-fusion=max -debug-only=polly-opt-isl -stats -analyze < %s 2>&1 | FileCheck %s +; +; // Original code +; int A[100]; +; int fusion(int c, int d, int n) { +; for (int j = 0; j < n; j++) +; A[j] ^= c; +; for (int i = 0; i < n; i++) +; A[i] |= d; +; return 0; +; } +; +; // AST of fused loops +; for (int c0 = 0; c0 < n; c0 += 1) { +; Stmt_for_body1(c0); +; Stmt_for_body2(c0); +; } +; +; // Loop fusion debugging info +; CHECK: Loop fusion 1: 2 loops fused at depth 1 +; +; // Loop fusion statistics +; CHECK: 2 polly-opt-isl {{.*}} Number of fused loops +; CHECK: 1 polly-opt-isl {{.*}} Number of loop fusions + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +@A = common global [100 x i32] zeroinitializer, align 4 + +define i32 @fusion(i32 %c, i32 %d, i32 %n) #0 { +entry: + br label %entry.split + +entry.split: + %cmp10 = icmp sgt i32 %n, 0 + br i1 %cmp10, label %for.body.lr.ph, label %for.cond1.preheader + +for.body.lr.ph: + br label %for.body + +for.cond.for.cond1.preheader_crit_edge: + br label %for.cond1.preheader + +for.cond1.preheader: + %cmp28 = icmp sgt i32 %n, 0 + br i1 %cmp28, label %for.body3.lr.ph, label %for.end7 + +for.body3.lr.ph: + br label %for.body3 + +for.body: + %j.011 = phi i32 [ 0, %for.body.lr.ph ], [ %1, %for.body ] + %arrayidx = getelementptr [100 x i32]* @A, i32 0, i32 %j.011 + %0 = load i32* %arrayidx, align 4 + %xor = xor i32 %0, %c + store i32 %xor, i32* %arrayidx, align 4 + %1 = add nsw i32 %j.011, 1 + %exitcond12 = icmp ne i32 %1, %n + br i1 %exitcond12, label %for.body, label %for.cond.for.cond1.preheader_crit_edge + +for.body3: + %i.09 = phi i32 [ 0, %for.body3.lr.ph ], [ %3, %for.body3 ] + %arrayidx4 = getelementptr [100 x i32]* @A, i32 0, i32 %i.09 + %2 = load i32* %arrayidx4, align 4 + %or = or i32 %2, %d + store i32 %or, i32* %arrayidx4, align 4 + %3 = add nsw i32 %i.09, 1 + %exitcond = icmp ne i32 %3, %n + br i1 %exitcond, label %for.body3, label %for.cond1.for.end7_crit_edge + +for.cond1.for.end7_crit_edge: + br label %for.end7 + +for.end7: + ret i32 0 +} Index: test/ScheduleOptimizer/loop-fusion-stats-02.ll =================================================================== --- /dev/null +++ test/ScheduleOptimizer/loop-fusion-stats-02.ll @@ -0,0 +1,97 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-opt-fusion=max -debug-only=polly-opt-isl -stats -analyze < %s 2>&1 | FileCheck %s +; +; // Original code +; int A[100]; +; int fusion(int c, int d, int m, int n) { +; for (int k = 0; k < m; k++) { +; for (int j = 0; j < n; j++) +; A[k + j] ^= c; +; for (int i = 0; i < n; i++) +; A[k + i] |= d; +; } +; return 0; +; } +; +; // AST of fused loops +; for (int c0 = 0; c0 < m + n - 1; c0 += 1) +; for (int c1 = max(-n + 1, -c0); c1 <= min(0, m - c0 - 1); c1 += 1) { +; Stmt_for_body1(c0 + c1, -c1); +; Stmt_for_body2(c0 + c1, -c1); +; } +; +; // Loop fusion debugging info +; CHECK: Loop fusion 1: 2 loops fused at depth 2 +; +; // Loop fusion statistics +; CHECK: 2 polly-opt-isl {{.*}} Number of fused loops +; CHECK: 1 polly-opt-isl {{.*}} Number of loop fusions + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +@A = common global [100 x i32] zeroinitializer, align 4 + +define i32 @fusion(i32 %c, i32 %d, i32 %m, i32 %n) #0 { +entry: + br label %entry.split + +entry.split: + %cmp19 = icmp sgt i32 %m, 0 + br i1 %cmp19, label %for.cond1.preheader.lr.ph, label %for.end14 + +for.cond1.preheader.lr.ph: + br label %for.cond1.preheader + +for.cond1.preheader: + %k.020 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %6, %for.inc12 ] + %cmp215 = icmp sgt i32 %n, 0 + br i1 %cmp215, label %for.body3.lr.ph, label %for.cond4.preheader + +for.body3.lr.ph: + br label %for.body3 + +for.cond1.for.cond4.preheader_crit_edge: + br label %for.cond4.preheader + +for.cond4.preheader: + %cmp517 = icmp sgt i32 %n, 0 + br i1 %cmp517, label %for.body6.lr.ph, label %for.inc12 + +for.body6.lr.ph: + br label %for.body6 + +for.body3: + %j.016 = phi i32 [ 0, %for.body3.lr.ph ], [ %2, %for.body3 ] + %0 = add i32 %k.020, %j.016 + %arrayidx = getelementptr [100 x i32]* @A, i32 0, i32 %0 + %1 = load i32* %arrayidx, align 4 + %xor = xor i32 %1, %c + store i32 %xor, i32* %arrayidx, align 4 + %2 = add nsw i32 %j.016, 1 + %exitcond = icmp ne i32 %2, %n + br i1 %exitcond, label %for.body3, label %for.cond1.for.cond4.preheader_crit_edge + +for.body6: + %i.018 = phi i32 [ 0, %for.body6.lr.ph ], [ %5, %for.body6 ] + %3 = add i32 %k.020, %i.018 + %arrayidx8 = getelementptr [100 x i32]* @A, i32 0, i32 %3 + %4 = load i32* %arrayidx8, align 4 + %or = or i32 %4, %d + store i32 %or, i32* %arrayidx8, align 4 + %5 = add nsw i32 %i.018, 1 + %exitcond22 = icmp ne i32 %5, %n + br i1 %exitcond22, label %for.body6, label %for.cond4.for.inc12_crit_edge + +for.cond4.for.inc12_crit_edge: + br label %for.inc12 + +for.inc12: + %6 = add nsw i32 %k.020, 1 + %exitcond23 = icmp ne i32 %6, %m + br i1 %exitcond23, label %for.cond1.preheader, label %for.cond.for.end14_crit_edge + +for.cond.for.end14_crit_edge: + br label %for.end14 + +for.end14: + ret i32 0 +} Index: test/ScheduleOptimizer/loop-fusion-stats-03.ll =================================================================== --- /dev/null +++ test/ScheduleOptimizer/loop-fusion-stats-03.ll @@ -0,0 +1,126 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-opt-fusion=max -debug-only=polly-opt-isl -stats -analyze < %s 2>&1 | FileCheck %s +; +; // Original code +; int A[100], B[100]; +; int fusion(int c, int d, int n) { +; for (int j = 0; j < n; j++) +; A[j] ^= c; +; for (int i = 0; i < n; i++) +; A[i] |= d; +; for (int j = 0; j < n; j++) +; B[j] ^= c; +; for (int i = 0; i < n; i++) +; B[i] |= d; +; return 0; +; } +; +; // AST of fused loops +; for (int c1 = 0; c1 < n; c1 += 1) { +; Stmt_for_body1(c1); +; Stmt_for_body2(c1); +; } +; for (int c1 = 0; c1 < n; c1 += 1) { +; Stmt_for_body3(c1); +; Stmt_for_body4(c1); +; } +; +; // Loop fusion debugging info +; CHECK: Loop fusion 1: 2 loops fused at depth 1 +; CHECK: Loop fusion 2: 2 loops fused at depth 1 +; +; // Loop fusion statistics +; CHECK: 4 polly-opt-isl {{.*}} Number of fused loops +; CHECK: 2 polly-opt-isl {{.*}} Number of loop fusions + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +@A = common global [100 x i32] zeroinitializer, align 4 +@B = common global [100 x i32] zeroinitializer, align 4 + +define i32 @fusion(i32 %c, i32 %d, i32 %n) #0 { +entry: + br label %entry.split + +entry.split: + %cmp32 = icmp sgt i32 %n, 0 + br i1 %cmp32, label %for.body.lr.ph, label %for.cond1.preheader + +for.body.lr.ph: + br label %for.body + +for.cond.for.cond1.preheader_crit_edge: + br label %for.cond1.preheader + +for.cond1.preheader: + %cmp230 = icmp sgt i32 %n, 0 + br i1 %cmp230, label %for.body3.lr.ph, label %for.cond9.preheader + +for.body3.lr.ph: + br label %for.body3 + +for.body: + %j.033 = phi i32 [ 0, %for.body.lr.ph ], [ %1, %for.body ] + %arrayidx = getelementptr [100 x i32]* @A, i32 0, i32 %j.033 + %0 = load i32* %arrayidx, align 4 + %xor = xor i32 %0, %c + store i32 %xor, i32* %arrayidx, align 4 + %1 = add nsw i32 %j.033, 1 + %exitcond36 = icmp ne i32 %1, %n + br i1 %exitcond36, label %for.body, label %for.cond.for.cond1.preheader_crit_edge + +for.cond1.for.cond9.preheader_crit_edge: + br label %for.cond9.preheader + +for.cond9.preheader: + %cmp1028 = icmp sgt i32 %n, 0 + br i1 %cmp1028, label %for.body11.lr.ph, label %for.cond18.preheader + +for.body11.lr.ph: + br label %for.body11 + +for.body3: + %i.031 = phi i32 [ 0, %for.body3.lr.ph ], [ %3, %for.body3 ] + %arrayidx4 = getelementptr [100 x i32]* @A, i32 0, i32 %i.031 + %2 = load i32* %arrayidx4, align 4 + %or = or i32 %2, %d + store i32 %or, i32* %arrayidx4, align 4 + %3 = add nsw i32 %i.031, 1 + %exitcond35 = icmp ne i32 %3, %n + br i1 %exitcond35, label %for.body3, label %for.cond1.for.cond9.preheader_crit_edge + +for.cond9.for.cond18.preheader_crit_edge: + br label %for.cond18.preheader + +for.cond18.preheader: + %cmp1926 = icmp sgt i32 %n, 0 + br i1 %cmp1926, label %for.body20.lr.ph, label %for.end25 + +for.body20.lr.ph: + br label %for.body20 + +for.body11: + %j8.029 = phi i32 [ 0, %for.body11.lr.ph ], [ %5, %for.body11 ] + %arrayidx12 = getelementptr [100 x i32]* @B, i32 0, i32 %j8.029 + %4 = load i32* %arrayidx12, align 4 + %xor13 = xor i32 %4, %c + store i32 %xor13, i32* %arrayidx12, align 4 + %5 = add nsw i32 %j8.029, 1 + %exitcond34 = icmp ne i32 %5, %n + br i1 %exitcond34, label %for.body11, label %for.cond9.for.cond18.preheader_crit_edge + +for.body20: + %i17.027 = phi i32 [ 0, %for.body20.lr.ph ], [ %7, %for.body20 ] + %arrayidx21 = getelementptr [100 x i32]* @B, i32 0, i32 %i17.027 + %6 = load i32* %arrayidx21, align 4 + %or22 = or i32 %6, %d + store i32 %or22, i32* %arrayidx21, align 4 + %7 = add nsw i32 %i17.027, 1 + %exitcond = icmp ne i32 %7, %n + br i1 %exitcond, label %for.body20, label %for.cond18.for.end25_crit_edge + +for.cond18.for.end25_crit_edge: + br label %for.end25 + +for.end25: + ret i32 0 +} Index: test/ScheduleOptimizer/loop-fusion-stats-04.ll =================================================================== --- /dev/null +++ test/ScheduleOptimizer/loop-fusion-stats-04.ll @@ -0,0 +1,150 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-opt-fusion=max -debug-only=polly-opt-isl -stats -analyze < %s 2>&1 | FileCheck %s +; +; // Original code +; int A[100], B[100]; +; int fusion(int c, int d, int m, int n) { +; for (int k = 0; k < m; k++) { +; for (int j = 0; j < n; j++) +; A[k + j] ^= c; +; for (int i = 0; i < n; i++) +; A[k + i] |= d; +; for (int j = 0; j < n; j++) +; B[k + j] ^= c; +; for (int i = 0; i < n; i++) +; B[k + i] |= d; +; } +; return 0; +; } +; +; // AST of fused loops +; for (int c1 = 0; c1 < m + n - 1; c1 += 1) +; for (int c2 = max(-n + 1, -c1); c2 <= min(0, m - c1 - 1); c2 += 1) { +; Stmt_for_body1(c1 + c2, -c2); +; Stmt_for_body2(c1 + c2, -c2); +; } +; for (int c1 = 0; c1 < m + n - 1; c1 += 1) +; for (int c2 = max(-n + 1, -c1); c2 <= min(0, m - c1 - 1); c2 += 1) { +; Stmt_for_body3(c1 + c2, -c2); +; Stmt_for_body4(c1 + c2, -c2); +; } +; +; // Loop fusion debugging info +; CHECK: Loop fusion 1: 2 loops fused at depth 2 +; CHECK: Loop fusion 2: 2 loops fused at depth 2 +; +; // Loop fusion statistics +; CHECK: 4 polly-opt-isl {{.*}} Number of fused loops +; CHECK: 2 polly-opt-isl {{.*}} Number of loop fusions + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +@A = common global [100 x i32] zeroinitializer, align 4 +@B = common global [100 x i32] zeroinitializer, align 4 + +define i32 @fusion(i32 %c, i32 %d, i32 %m, i32 %n) #0 { +entry: + br label %entry.split + +entry.split: + %cmp43 = icmp sgt i32 %m, 0 + br i1 %cmp43, label %for.cond1.preheader.lr.ph, label %for.end34 + +for.cond1.preheader.lr.ph: + br label %for.cond1.preheader + +for.cond1.preheader: + %k.044 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %12, %for.inc32 ] + %cmp235 = icmp sgt i32 %n, 0 + br i1 %cmp235, label %for.body3.lr.ph, label %for.cond4.preheader + +for.body3.lr.ph: + br label %for.body3 + +for.cond1.for.cond4.preheader_crit_edge: + br label %for.cond4.preheader + +for.cond4.preheader: + %cmp537 = icmp sgt i32 %n, 0 + br i1 %cmp537, label %for.body6.lr.ph, label %for.cond13.preheader + +for.body6.lr.ph: + br label %for.body6 + +for.body3: + %j.036 = phi i32 [ 0, %for.body3.lr.ph ], [ %2, %for.body3 ] + %0 = add i32 %k.044, %j.036 + %arrayidx = getelementptr [100 x i32]* @A, i32 0, i32 %0 + %1 = load i32* %arrayidx, align 4 + %xor = xor i32 %1, %c + store i32 %xor, i32* %arrayidx, align 4 + %2 = add nsw i32 %j.036, 1 + %exitcond = icmp ne i32 %2, %n + br i1 %exitcond, label %for.body3, label %for.cond1.for.cond4.preheader_crit_edge + +for.cond4.for.cond13.preheader_crit_edge: + br label %for.cond13.preheader + +for.cond13.preheader: + %cmp1439 = icmp sgt i32 %n, 0 + br i1 %cmp1439, label %for.body15.lr.ph, label %for.cond23.preheader + +for.body15.lr.ph: + br label %for.body15 + +for.body6: + %i.038 = phi i32 [ 0, %for.body6.lr.ph ], [ %5, %for.body6 ] + %3 = add i32 %k.044, %i.038 + %arrayidx8 = getelementptr [100 x i32]* @A, i32 0, i32 %3 + %4 = load i32* %arrayidx8, align 4 + %or = or i32 %4, %d + store i32 %or, i32* %arrayidx8, align 4 + %5 = add nsw i32 %i.038, 1 + %exitcond46 = icmp ne i32 %5, %n + br i1 %exitcond46, label %for.body6, label %for.cond4.for.cond13.preheader_crit_edge + +for.cond13.for.cond23.preheader_crit_edge: + br label %for.cond23.preheader + +for.cond23.preheader: + %cmp2441 = icmp sgt i32 %n, 0 + br i1 %cmp2441, label %for.body25.lr.ph, label %for.inc32 + +for.body25.lr.ph: + br label %for.body25 + +for.body15: + %j12.040 = phi i32 [ 0, %for.body15.lr.ph ], [ %8, %for.body15 ] + %6 = add i32 %k.044, %j12.040 + %arrayidx17 = getelementptr [100 x i32]* @B, i32 0, i32 %6 + %7 = load i32* %arrayidx17, align 4 + %xor18 = xor i32 %7, %c + store i32 %xor18, i32* %arrayidx17, align 4 + %8 = add nsw i32 %j12.040, 1 + %exitcond47 = icmp ne i32 %8, %n + br i1 %exitcond47, label %for.body15, label %for.cond13.for.cond23.preheader_crit_edge + +for.body25: + %i22.042 = phi i32 [ 0, %for.body25.lr.ph ], [ %11, %for.body25 ] + %9 = add i32 %k.044, %i22.042 + %arrayidx27 = getelementptr [100 x i32]* @B, i32 0, i32 %9 + %10 = load i32* %arrayidx27, align 4 + %or28 = or i32 %10, %d + store i32 %or28, i32* %arrayidx27, align 4 + %11 = add nsw i32 %i22.042, 1 + %exitcond48 = icmp ne i32 %11, %n + br i1 %exitcond48, label %for.body25, label %for.cond23.for.inc32_crit_edge + +for.cond23.for.inc32_crit_edge: + br label %for.inc32 + +for.inc32: + %12 = add nsw i32 %k.044, 1 + %exitcond49 = icmp ne i32 %12, %m + br i1 %exitcond49, label %for.cond1.preheader, label %for.cond.for.end34_crit_edge + +for.cond.for.end34_crit_edge: + br label %for.end34 + +for.end34: + ret i32 0 +} Index: test/ScheduleOptimizer/loop-fusion-stats-05.ll =================================================================== --- /dev/null +++ test/ScheduleOptimizer/loop-fusion-stats-05.ll @@ -0,0 +1,113 @@ +; RUN: opt %loadPolly -polly-opt-isl -polly-opt-fusion=max -debug-only=polly-opt-isl -stats -analyze < %s 2>&1 | FileCheck %s +; +; // Original code +; int A[100], B[100]; +; int fusion(int c, int d, int m, int n) { +; for (int k = 0; k < m; k++) +; for (int j = 0; j < n; j++) +; A[j] ^= c; +; for (int k = 0; k < m; k++) +; for (int i = 0; i < n; i++) +; A[i] |= d; +; return 0; +; } +; +; // AST of fused loops +; for (int c0 = 0; c0 < n; c0 += 1) { +; for (int c1 = 0; c1 < m; c1 += 1) +; Stmt_for_body1(c1, c0); +; for (int c1 = m; c1 < 2 * m; c1 += 1) +; Stmt_for_body2(-m + c1, c0); +; } +; +; // Loop fusion debugging info +; CHECK: Loop fusion 1: 2 loops fused at depth 2 +; +; // Loop fusion statistics +; CHECK: 2 polly-opt-isl {{.*}} Number of fused loops +; CHECK: 1 polly-opt-isl {{.*}} Number of loop fusions + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +@A = common global [100 x i32] zeroinitializer, align 4 +@B = common global [100 x i32] zeroinitializer, align 4 + +define i32 @fusion(i32 %c, i32 %d, i32 %m, i32 %n) #0 { +entry: + br label %entry.split + +entry.split: + %cmp27 = icmp sgt i32 %m, 0 + br i1 %cmp27, label %for.cond1.preheader.lr.ph, label %for.cond8.preheader + +for.cond1.preheader.lr.ph: + br label %for.cond1.preheader + +for.cond1.preheader: + %k.028 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %2, %for.inc4 ] + %cmp225 = icmp sgt i32 %n, 0 + br i1 %cmp225, label %for.body3.lr.ph, label %for.inc4 + +for.body3.lr.ph: + br label %for.body3 + +for.cond.for.cond8.preheader_crit_edge: + br label %for.cond8.preheader + +for.cond8.preheader: + %cmp923 = icmp sgt i32 %m, 0 + br i1 %cmp923, label %for.cond11.preheader.lr.ph, label %for.end20 + +for.cond11.preheader.lr.ph: + br label %for.cond11.preheader + +for.body3: + %j.026 = phi i32 [ 0, %for.body3.lr.ph ], [ %1, %for.body3 ] + %arrayidx = getelementptr [100 x i32]* @A, i32 0, i32 %j.026 + %0 = load i32* %arrayidx, align 4 + %xor = xor i32 %0, %c + store i32 %xor, i32* %arrayidx, align 4 + %1 = add nsw i32 %j.026, 1 + %exitcond30 = icmp ne i32 %1, %n + br i1 %exitcond30, label %for.body3, label %for.cond1.for.inc4_crit_edge + +for.cond1.for.inc4_crit_edge: + br label %for.inc4 + +for.inc4: + %2 = add nsw i32 %k.028, 1 + %exitcond31 = icmp ne i32 %2, %m + br i1 %exitcond31, label %for.cond1.preheader, label %for.cond.for.cond8.preheader_crit_edge + +for.cond11.preheader: + %k7.024 = phi i32 [ 0, %for.cond11.preheader.lr.ph ], [ %5, %for.inc18 ] + %cmp1221 = icmp sgt i32 %n, 0 + br i1 %cmp1221, label %for.body13.lr.ph, label %for.inc18 + +for.body13.lr.ph: + br label %for.body13 + +for.body13: + %i.022 = phi i32 [ 0, %for.body13.lr.ph ], [ %4, %for.body13 ] + %arrayidx14 = getelementptr [100 x i32]* @A, i32 0, i32 %i.022 + %3 = load i32* %arrayidx14, align 4 + %or = or i32 %3, %d + store i32 %or, i32* %arrayidx14, align 4 + %4 = add nsw i32 %i.022, 1 + %exitcond = icmp ne i32 %4, %n + br i1 %exitcond, label %for.body13, label %for.cond11.for.inc18_crit_edge + +for.cond11.for.inc18_crit_edge: + br label %for.inc18 + +for.inc18: + %5 = add nsw i32 %k7.024, 1 + %exitcond29 = icmp ne i32 %5, %m + br i1 %exitcond29, label %for.cond11.preheader, label %for.cond8.for.end20_crit_edge + +for.cond8.for.end20_crit_edge: + br label %for.end20 + +for.end20: + ret i32 0 +}