Index: llvm/lib/Analysis/IVDescriptors.cpp
===================================================================
--- llvm/lib/Analysis/IVDescriptors.cpp
+++ llvm/lib/Analysis/IVDescriptors.cpp
@@ -309,6 +309,10 @@
   // flags from all the reduction operations.
   FastMathFlags FMF = FastMathFlags::getFast();
 
+  // The first instruction in the use-def chain of the Phi node that requires
+  // exact floating point operations.
+  Instruction *ExactFPMathInst = nullptr;
+
   // A value in the reduction can be used:
   // - By the reduction:
   //   - Reduction operation:
@@ -352,6 +356,9 @@
     if (Cur != Start) {
       ReduxDesc =
           isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF);
+      ExactFPMathInst = ExactFPMathInst == nullptr
+                            ? ReduxDesc.getExactFPMathInst()
+                            : ExactFPMathInst;
       if (!ReduxDesc.isRecurrence())
         return false;
       // FIXME: FMF is allowed on phi, but propagation is not handled correctly.
@@ -480,8 +487,8 @@
   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
     return false;
 
-  const bool IsOrdered = checkOrderedReduction(
-      Kind, ReduxDesc.getExactFPMathInst(), ExitInstruction, Phi);
+  const bool IsOrdered =
+      checkOrderedReduction(Kind, ExactFPMathInst, ExitInstruction, Phi);
 
   if (Start != Phi) {
     // If the starting value is not the same as the phi node, we speculatively
@@ -538,9 +545,8 @@
   // is saved as part of the RecurrenceDescriptor.
 
   // Save the description of this reduction variable.
-  RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
-                          ReduxDesc.getExactFPMathInst(), RecurrenceType,
-                          IsSigned, IsOrdered, CastInsts,
+  RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst,
+                          RecurrenceType, IsSigned, IsOrdered, CastInsts,
                           MinWidthCastToRecurrenceType);
   RedDes = RD;
   return true;
Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -68,6 +68,10 @@
 // TODO: Check if we can use a sparse matrix here.
 using CharMatrix = std::vector<std::vector<char>>;
 
+/// ReductionList contains the reduction descriptors that correspond
+/// to each reduction PHI node, which will be used to determine whether
+/// it is safe to reorder floating-point instructions in interchange.
+using ReductionList = SmallDenseMap<Value *, RecurrenceDescriptor, 2>;
 } // end anonymous namespace
 
 // Maximum number of dependencies that can be handled in the dependency matrix.
@@ -349,6 +353,11 @@
 
   /// Set of inner loop induction PHIs
   SmallVector<PHINode *, 8> InnerLoopInductions;
+
+  /// Keep track of reduction descriptors to determine whether floating-point
+  /// PHIs are supported for interchange, i.e., whether the reduction
+  /// operations can be reordered.
+  ReductionList RedDesc;
 };
 
 /// LoopInterchangeProfitability checks if it is profitable to interchange the
@@ -723,7 +732,8 @@
 }
 
 // Check V's users to see if it is involved in a reduction in L.
-static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
+static PHINode *findInnerReductionPhi(Loop *L, Value *V,
+                                      ReductionList &RedDesc) {
   // Reduction variables cannot be constants.
   if (isa<Constant>(V))
     return nullptr;
@@ -733,8 +743,10 @@
       if (PHI->getNumIncomingValues() == 1)
         continue;
       RecurrenceDescriptor RD;
-      if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
+      if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
+        RedDesc[V] = RD;
         return PHI;
+      }
       return nullptr;
     }
   }
@@ -766,7 +778,7 @@
     // Check if we have a PHI node in the outer loop that has a reduction
     // result from the inner loop as an incoming value.
     Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
-    PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
+    PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V, RedDesc);
     if (!InnerRedPhi ||
         !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
       LLVM_DEBUG(
@@ -890,31 +902,34 @@
 // will still be true after interchanging. If we have multiple predecessor,
 // that may not be the case, e.g. because the outer loop latch may be executed
 // if the inner loop is not executed.
-static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
+static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop,
+                                          ReductionList &RedDesc) {
   BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
   for (PHINode &PHI : LoopNestExit->phis()) {
-    // FIXME: We currently are not able to detect floating point reductions
-    //        and have to use floating point PHIs as a proxy to prevent
-    //        interchanging in the presence of floating point reductions.
-    if (PHI.getType()->isFloatingPointTy())
-      return false;
+    if (PHI.getType()->isFloatingPointTy()) {
+      // Interchange is only safe if this fp PHI is a recognized reduction
+      // whose operations may all be reordered; an fp PHI we have no
+      // descriptor for must conservatively block the interchange.
+      auto It = RedDesc.find(followLCSSA(&PHI));
+      if (It == RedDesc.end() || It->second.getExactFPMathInst() != nullptr)
+        return false;
+    }
     for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
-      Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
-      if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
-        continue;
-
-      // The incoming value is defined in the outer loop latch. Currently we
-      // only support that in case the outer loop latch has a single predecessor.
-      // This guarantees that the outer loop latch is executed if and only if
-      // the inner loop is executed (because tightlyNested() guarantees that the
-      // outer loop header only branches to the inner loop or the outer loop
-      // latch).
-      // FIXME: We could weaken this logic and allow multiple predecessors,
-      // if the values are produced outside the loop latch. We would need
-      // additional logic to update the PHI nodes in the exit block as
-      // well.
-      if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
-        return false;
+      Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
+      if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
+        continue;
+
+      // The incoming value is defined in the outer loop latch. Currently we
+      // only support that in case the outer loop latch has a single predecessor.
+      // This guarantees that the outer loop latch is executed if and only if
+      // the inner loop is executed (because tightlyNested() guarantees that the
+      // outer loop header only branches to the inner loop or the outer loop
+      // latch).
+      // FIXME: We could weaken this logic and allow multiple predecessors,
+      // if the values are produced outside the loop latch. We would need
+      // additional logic to update the PHI nodes in the exit block as
+      // well.
+      if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
+        return false;
     }
   }
   return true;
@@ -1038,7 +1053,7 @@
     return false;
   }
 
-  if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
+  if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop, RedDesc)) {
     LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
Index: llvm/test/Transforms/LoopInterchange/lcssa.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/lcssa.ll
+++ llvm/test/Transforms/LoopInterchange/lcssa.ll
@@ -135,9 +135,8 @@
   ret void
 }
 
-; FIXME: We currently do not support LCSSA phi nodes involving floating point
-; types, as we fail to detect floating point reductions for now.
-; REMARK: UnsupportedPHIOuter
+; Loops with floating point reductions are interchanged with fastmath.
+; REMARK: Interchanged
 ; REMARK-NEXT: lcssa_04
 
 define void @lcssa_04() {
@@ -146,28 +145,31 @@
 
 outer.header:                                     ; preds = %outer.inc, %entry
   %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
-  %float.outer = phi float [ 1.000000e+00, %entry ], [ 2.000000e+00, %outer.inc ]
+  %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.outer.next, %outer.inc ]
   br label %for.body3
 
 for.body3:                                        ; preds = %for.body3, %outer.header
   %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %float.inner = phi float [ %float.inner.next, %for.body3 ], [ %float.outer, %outer.header ]
   %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
   %vA = load i32, i32* %arrayidx5
   %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
   %vC = load i32, i32* %arrayidx9
   %add = add nsw i32 %vA, %vC
+  %float.inner.next = fadd fast float %float.inner, 1.000000e+00
   store i32 %add, i32* %arrayidx5
   %iv.inner.next = add nuw nsw i64 %iv.inner, 1
   %exitcond = icmp eq i64 %iv.inner.next, 100
   br i1 %exitcond, label %outer.inc, label %for.body3
 
 outer.inc:                                        ; preds = %for.body3
+  %float.outer.next = phi float [ %float.inner.next, %for.body3 ]
   %iv.outer.next = add nsw i64 %iv.outer, 1
   %cmp = icmp eq i64 %iv.outer.next, 100
   br i1 %cmp, label %outer.header, label %for.exit
 
 for.exit:                                         ; preds = %outer.inc
-  %float.outer.lcssa = phi float [ %float.outer, %outer.inc ]
+  %float.outer.lcssa = phi float [ %float.outer.next, %outer.inc ]
   store float %float.outer.lcssa, float* @F
   br label %for.end16
Index: llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
===================================================================
--- llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
+++ llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
@@ -189,3 +189,83 @@
   %il.res.lcssa2 = phi i64 [ %sum.inc.amend, %for1.inc ]
   ret i64 %il.res.lcssa2
 }
+
+; Floating point reductions are interchanged if all the fp instructions
+; involved allow reassociation.
+; REMARKS: --- !Passed
+; REMARKS-NEXT: Pass: loop-interchange
+; REMARKS-NEXT: Name: Interchanged
+; REMARKS-NEXT: Function: test4
+
+define float @test4([100 x [100 x float]]* %Arr, [100 x [100 x float]]* %Arr2) {
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %outer.inc, %entry
+  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.inner.lcssa, %outer.inc ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %outer.header
+  %float.inner = phi float [ %float.outer , %outer.header ], [ %float.inner.inc.inc, %for.body3 ]
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load float, float* %arrayidx5
+  %float.inner.inc = fadd fast float %float.inner, %vA
+  %arrayidx6 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr2, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vB = load float, float* %arrayidx6
+  %float.inner.inc.inc = fadd fast float %float.inner.inc, %vB
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:                                        ; preds = %for.body3
+  %float.inner.lcssa = phi float [ %float.inner.inc.inc, %for.body3 ]
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:                                         ; preds = %outer.inc
+  %float.outer.lcssa = phi float [ %float.inner.lcssa, %outer.inc ]
+  ret float %float.outer.lcssa
+}
+
+; Floating point reductions are not interchanged if not all the fp instructions
+; involved allow reassociation.
+; REMARKS: --- !Missed
+; REMARKS-NEXT: Pass: loop-interchange
+; REMARKS-NEXT: Name: UnsupportedExitPHI
+; REMARKS-NEXT: Function: test5
+
+define float @test5([100 x [100 x float]]* %Arr, [100 x [100 x float]]* %Arr2) {
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %outer.inc, %entry
+  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.inner.lcssa, %outer.inc ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %outer.header
+  %float.inner = phi float [ %float.outer , %outer.header ], [ %float.inner.inc.inc, %for.body3 ]
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load float, float* %arrayidx5
+  %float.inner.inc = fadd float %float.inner, %vA ; do not allow reassociation
+  %arrayidx6 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr2, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vB = load float, float* %arrayidx6
+  %float.inner.inc.inc = fadd fast float %float.inner.inc, %vB
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:                                        ; preds = %for.body3
+  %float.inner.lcssa = phi float [ %float.inner.inc.inc, %for.body3 ]
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:                                         ; preds = %outer.inc
+  %float.outer.lcssa = phi float [ %float.inner.lcssa, %outer.inc ]
+  ret float %float.outer.lcssa
+}