Index: lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- lib/Transforms/Scalar/LoopInterchange.cpp +++ lib/Transforms/Scalar/LoopInterchange.cpp @@ -351,7 +351,8 @@ bool containsUnsafeInstructionsInLatch(BasicBlock *BB); bool findInductionAndReductions(Loop *L, SmallVector &Inductions, - SmallVector &Reductions); + SmallVector &Reductions, + Loop *InnerLoop); Loop *OuterLoop; Loop *InnerLoop; @@ -713,9 +714,35 @@ return true; } +// If SV is a LCSSA PHI node with a single incoming value, return the incoming +// value. +static Value *followLCSSA(Value *SV) { + PHINode *PHI = dyn_cast(SV); + if (!PHI) + return SV; + + if (PHI->getNumIncomingValues() != 1) + return SV; + return followLCSSA(PHI->getIncomingValue(0)); +} + +// Check V's users to see if it is involved in a reduction in L. +static PHINode *findInnerReductionPhi(Loop *L, Value *V) { + for (Value *User : V->users()) { + if (PHINode *PHI = dyn_cast(User)) { + RecurrenceDescriptor RD; + if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) + return PHI; + return nullptr; + } + } + + return nullptr; +} + bool LoopInterchangeLegality::findInductionAndReductions( Loop *L, SmallVector &Inductions, - SmallVector &Reductions) { + SmallVector &Reductions, Loop *InnerLoop) { if (!L->getLoopLatch() || !L->getLoopPredecessor()) return false; for (PHINode &PHI : L->getHeader()->phis()) { @@ -725,7 +752,21 @@ Inductions.push_back(&PHI); else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD)) Reductions.push_back(&PHI); - else { + else if (InnerLoop && PHI.getNumIncomingValues() == 2) { + // Check if we have a PHI node in the outer loop that has a reduction + // result from the inner loop as an incoming value. + Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch())); + PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V); + if (!InnerRedPhi || + !llvm::any_of(InnerRedPhi->incoming_values(), + [&PHI](Value *V) { return V == &PHI; })) { + LLVM_DEBUG( + dbgs() + << "Failed to recognize PHI as an induction or reduction.\n"); + return false; + } + Reductions.push_back(&PHI); + } else { LLVM_DEBUG( dbgs() << "Failed to recognize PHI as an induction or reduction.\n"); return false; @@ -778,7 +819,7 @@ PHINode *InnerInductionVar; SmallVector Inductions; SmallVector Reductions; - if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { + if (!findInductionAndReductions(InnerLoop, Inductions, Reductions, nullptr)) { LLVM_DEBUG( dbgs() << "Only inner loops with induction or reduction PHI nodes " << "are supported currently.\n"); @@ -811,7 +852,8 @@ InnerInductionVar = Inductions.pop_back_val(); Reductions.clear(); - if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) { + if (!findInductionAndReductions(OuterLoop, Inductions, Reductions, + InnerLoop)) { LLVM_DEBUG( dbgs() << "Only outer loops with induction or reduction PHI nodes " << "are supported currently.\n"); @@ -825,20 +867,6 @@ return true; } - // Outer loop cannot have reduction because then loops will not be tightly - // nested. - if (!Reductions.empty()) { - LLVM_DEBUG(dbgs() << "Outer loops with reductions are not supported " - << "currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter", - OuterLoop->getStartLoc(), - OuterLoop->getHeader()) - << "Outer loops with reductions cannot be interchangeed " - "currently."; - }); - return true; - } // TODO: Currently we handle only loops with 1 induction variable. if (Inductions.size() != 1) { LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not " @@ -1477,17 +1505,22 @@ // Move the PHI nodes from the inner loop header to the outer loop header. // We have to deal with one kind of PHI nodes: // 1) PHI nodes that are part of a reduction in the outer loop - // We only have to move the PHI node and update the incoming blocks. + // 2) PHI nodes that are part of inner loop-only reductions. + // For 1) we only have to move the PHI node and update the incoming blocks. + // For 2) we can replace the PHI node with the value coming from outside the // inner loop. for (PHINode *PHI : InnerLoopPHIs) { PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); for (BasicBlock *InBB : PHI->blocks()) { - if (InnerLoop->contains(InBB)) + // Skip incoming blocks that either are part of the inner loop or if their + // incoming value is a PHI node. For case 2) those conditions hold for + // all incoming blocks, as the reduction PHI checks earlier require that + // the incoming value to the inner reduction PHI is the outer reduction + // PHI. + if (InnerLoop->contains(InBB) || + isa(PHI->getIncomingValueForBlock(InBB))) continue; - assert(isa(PHI->getIncomingValueForBlock(InBB)) && - "Unexpected incoming PHI node, reductions in outer loop are not " - "supported yet"); PHI->replaceAllUsesWith(PHI->getIncomingValueForBlock(InBB)); PHI->eraseFromParent(); break; Index: test/Transforms/LoopInterchange/lcssa.ll =================================================================== --- test/Transforms/LoopInterchange/lcssa.ll +++ test/Transforms/LoopInterchange/lcssa.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -; RUN: cat %t | FileCheck --check-prefix REMARK %s +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S +; RUN: FileCheck --input-file %t --check-prefix REMARK %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" Index: test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll =================================================================== --- /dev/null +++ test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll @@ -0,0 +1,151 @@ +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ +; RUN: -verify-dom-info -verify-loop-info -stats 2>&1 | FileCheck %s +; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; REMARKS: --- !Passed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: Interchanged +; REMARKS-NEXT: Function: test1 + +define i64 @test1([100 x [100 x i64]]* %Arr) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR2_PREHEADER:%.*]] +; CHECK: for1.header.preheader: +; CHECK-NEXT: br label [[FOR1_HEADER:%.*]] +; CHECK: for1.header: +; CHECK-NEXT: [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR1_INC:%.*]] ], [ 0, [[FOR1_HEADER_PREHEADER:%.*]] ] +; CHECK-NEXT: [[SUM_INNER:%.*]] = phi i64 [ [[SUM_INC:%.*]], [[FOR1_INC]] ], [ [[SUM_OUTER:%.*]], [[FOR1_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR2_SPLIT1:%.*]] +; CHECK: for2.preheader: +; CHECK-NEXT: br label [[FOR2:%.*]] +; CHECK: for2: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR2_SPLIT:%.*]] ], [ 0, [[FOR2_PREHEADER]] ] +; CHECK-NEXT: [[SUM_OUTER]] = phi i64 [ [[SUM_INC]], [[FOR2_SPLIT]] ], [ 0, [[FOR2_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR1_HEADER_PREHEADER]] +; CHECK: for2.split1: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* [[ARR:%.*]], i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV23]] +; CHECK-NEXT: [[LV:%.*]] = load i64, i64* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[SUM_INC]] = add i64 [[SUM_INNER]], [[LV]] +; CHECK-NEXT: br label [[FOR1_INC]] +; CHECK: for2.split: +; CHECK-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXIT1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 100 +; CHECK-NEXT: br i1 [[EXIT1]], label [[FOR1_LOOPEXIT:%.*]], label [[FOR2]] +; CHECK: for1.inc: +; CHECK-NEXT: [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1 +; CHECK-NEXT: [[EXIT2:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT24]], 100 +; CHECK-NEXT: br i1 [[EXIT2]], label [[FOR2_SPLIT]], label [[FOR1_HEADER]] +; CHECK: for1.loopexit: +; CHECK-NEXT: [[SUM_INC_LCSSA2:%.*]] = phi i64 [ [[SUM_INC]], [[FOR2_SPLIT]] ] +; CHECK-NEXT: ret i64 [[SUM_INC_LCSSA2]] +; +entry: + br label %for1.header + +for1.header: ; preds = %for1.inc, %entry + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ] + %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ] + br label %for2 + +for2: ; preds = %for2, %for1.header + %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ] + %sum.inner = phi i64 [ %sum.outer, %for1.header ], [ %sum.inc, %for2 ] + %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23 + %lv = load i64, i64* %arrayidx, align 4 + %sum.inc = add i64 %sum.inner, %lv + %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 + %exit1 = icmp eq i64 %indvars.iv.next.3, 100 + br i1 %exit1, label %for1.inc, label %for2 + +for1.inc: ; preds = %for2 + %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %exit2 = icmp eq i64 %indvars.iv.next24, 100 + br i1 %exit2, label %for1.loopexit, label %for1.header + +for1.loopexit: ; preds = %for1.inc + %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ] + ret i64 %sum.inc.lcssa2 +} + +; In this test case, the inner reduction PHI %inner does not involve the outer +; reduction PHI %sum.outer, do not interchange. +; REMARKS: --- !Missed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: UnsupportedPHIOuter +; REMARKS-NEXT: Function: test2 + +define i64 @test2([100 x [100 x i64]]* %Arr) { +entry: + br label %for1.header + +for1.header: ; preds = %for1.inc, %entry + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ] + %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ] + br label %for2 + +for2: ; preds = %for2, %for1.header + %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ] + %inner = phi i64 [ %indvars.iv23, %for1.header ], [ %sum.inc, %for2 ] + %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23 + %lv = load i64, i64* %arrayidx, align 4 + %sum.inc = add i64 %inner, %lv + %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 + %exit1 = icmp eq i64 %indvars.iv.next.3, 100 + br i1 %exit1, label %for1.inc, label %for2 + +for1.inc: ; preds = %for2 + %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %exit2 = icmp eq i64 %indvars.iv.next24, 100 + br i1 %exit2, label %for1.loopexit, label %for1.header + +for1.loopexit: ; preds = %for1.inc + %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ] + ret i64 %sum.inc.lcssa2 +} + +; Check that we do not interchange if there is an additional instruction +; between the outer and inner reduction PHIs. +; REMARKS: --- !Missed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: UnsupportedPHIOuter +; REMARKS-NEXT: Function: test3 + +define i64 @test3([100 x [100 x i64]]* %Arr) { +entry: + br label %for1.header + +for1.header: ; preds = %for1.inc, %entry + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ] + %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ] + %so = add i64 %sum.outer, 10 + br label %for2 + +for2: ; preds = %for2, %for1.header + %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ] + %sum.inner = phi i64 [ %so, %for1.header ], [ %sum.inc, %for2 ] + %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23 + %lv = load i64, i64* %arrayidx, align 4 + %sum.inc = add i64 %sum.inner, %lv + %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1 + %exit1 = icmp eq i64 %indvars.iv.next.3, 100 + br i1 %exit1, label %for1.inc, label %for2 + +for1.inc: ; preds = %for2 + %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %exit2 = icmp eq i64 %indvars.iv.next24, 100 + br i1 %exit2, label %for1.loopexit, label %for1.header + +for1.loopexit: ; preds = %for1.inc + %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ] + ret i64 %sum.inc.lcssa2 +} + + Index: test/Transforms/LoopInterchange/reductions.ll =================================================================== --- test/Transforms/LoopInterchange/reductions.ll +++ test/Transforms/LoopInterchange/reductions.ll @@ -1,5 +1,6 @@ -; REQUIRES: asserts -; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -S -debug 2>&1 | FileCheck %s +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \ +; RUN: -verify-dom-info -verify-loop-info 2>&1 | FileCheck -check-prefix=IR %s +; RUN: FileCheck --input-file=%t %s @A = common global [500 x [500 x i32]] zeroinitializer @X = common global i32 0 @@ -11,7 +12,14 @@ ;; X+=A[j][i]; ;; Loop is interchanged check that the phi nodes are split and the promoted value is used instead of the reduction phi. -; CHECK: Loops interchanged. + +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_01 + +; IR-LABEL: @reduction_01( +; IR: for.body3.split define void @reduction_01(i32 %N) { entry: @@ -54,7 +62,14 @@ ;; } ;; Loop is interchanged check that the phi nodes are split and the promoted value is used instead of the reduction phi. -; CHECK: Loops interchanged. + +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_02 + +; IR-LABEL: @reduction_02( +; IR: for.body6.split define void @reduction_02(i32 %N) { entry: @@ -115,7 +130,14 @@ ;; Not tightly nested. Do not interchange. ;; Not interchanged hence the phi's in the inner loop will not be split. -; CHECK: Outer loops with reductions are not supported currently. + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: NotTightlyNested +; CHECK-NEXT: Function: reduction_03 + +; IR-LABEL: @reduction_03( +; IR-NOT: split define void @reduction_03(i32 %N) { entry: @@ -174,7 +196,11 @@ ;; } ;; Not interchanged hence the phi's in the inner loop will not be split. -; CHECK: Only inner loops with induction or reduction PHI nodes are supported currently. + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: UnsupportedPHI +; CHECK-NEXT: Function: reduction_04 define void @reduction_04(i32 %N) { entry: @@ -225,7 +251,15 @@ ;; for( int j=1;j