Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -733,8 +733,12 @@ if (PHI->getNumIncomingValues() == 1) continue; RecurrenceDescriptor RD; - if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) + if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) { + // Detect floating point reduction only when it can be reordered. + if (RD.getExactFPMathInst() != nullptr) + return nullptr; return PHI; + } return nullptr; } } @@ -893,28 +897,23 @@ static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); for (PHINode &PHI : LoopNestExit->phis()) { - // FIXME: We currently are not able to detect floating point reductions - // and have to use floating point PHIs as a proxy to prevent - // interchanging in the presence of floating point reductions. - if (PHI.getType()->isFloatingPointTy()) - return false; for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { - Instruction *IncomingI = dyn_cast(PHI.getIncomingValue(i)); - if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) - continue; - - // The incoming value is defined in the outer loop latch. Currently we - // only support that in case the outer loop latch has a single predecessor. - // This guarantees that the outer loop latch is executed if and only if - // the inner loop is executed (because tightlyNested() guarantees that the - // outer loop header only branches to the inner loop or the outer loop - // latch). - // FIXME: We could weaken this logic and allow multiple predecessors, - // if the values are produced outside the loop latch. We would need - // additional logic to update the PHI nodes in the exit block as - // well. - if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) - return false; + Instruction *IncomingI = dyn_cast(PHI.getIncomingValue(i)); + if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) + continue; + + // The incoming value is defined in the outer loop latch. Currently we + // only support that in case the outer loop latch has a single predecessor. + // This guarantees that the outer loop latch is executed if and only if + // the inner loop is executed (because tightlyNested() guarantees that the + // outer loop header only branches to the inner loop or the outer loop + // latch). + // FIXME: We could weaken this logic and allow multiple predecessors, + // if the values are produced outside the loop latch. We would need + // additional logic to update the PHI nodes in the exit block as + // well. + if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) + return false; } } return true; Index: llvm/test/Transforms/LoopInterchange/lcssa.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/lcssa.ll +++ llvm/test/Transforms/LoopInterchange/lcssa.ll @@ -135,9 +135,8 @@ ret void } -; FIXME: We currently do not support LCSSA phi nodes involving floating point -; types, as we fail to detect floating point reductions for now. -; REMARK: UnsupportedPHIOuter +; Loops with floating point reductions are interchanged with fastmath. +; REMARK: Interchanged ; REMARK-NEXT: lcssa_04 define void @lcssa_04() { @@ -146,28 +145,31 @@ outer.header: ; preds = %outer.inc, %entry %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ] - %float.outer = phi float [ 1.000000e+00, %entry ], [ 2.000000e+00, %outer.inc ] + %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.outer.next, %outer.inc ] br label %for.body3 for.body3: ; preds = %for.body3, %outer.header %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] + %float.inner = phi float [ %float.inner.next, %for.body3 ], [ %float.outer, %outer.header ] %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer %vA = load i32, i32* %arrayidx5 %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer %vC = load i32, i32* %arrayidx9 %add = add nsw i32 %vA, %vC + %float.inner.next = fadd fast float %float.inner, 1.000000e+00 store i32 %add, i32* %arrayidx5 %iv.inner.next = add nuw nsw i64 %iv.inner, 1 %exitcond = icmp eq i64 %iv.inner.next, 100 br i1 %exitcond, label %outer.inc, label %for.body3 outer.inc: ; preds = %for.body3 + %float.outer.next = phi float [ %float.inner.next, %for.body3 ] %iv.outer.next = add nsw i64 %iv.outer, 1 %cmp = icmp eq i64 %iv.outer.next, 100 br i1 %cmp, label %outer.header, label %for.exit for.exit: ; preds = %outer.inc - %float.outer.lcssa = phi float [ %float.outer, %outer.inc ] + %float.outer.lcssa = phi float [ %float.outer.next, %outer.inc ] store float %float.outer.lcssa, float* @F br label %for.end16 Index: llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll =================================================================== --- llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll +++ llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll @@ -227,3 +227,83 @@ %il.res.lcssa2 = phi i64 [ %sum.inc.amend, %for1.inc ] ret i64 %il.res.lcssa2 } + +; Floating point reductions are interchanged if all the fp instructions +; involved allow reassociation. +; REMARKS: --- !Passed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: Interchanged +; REMARKS-NEXT: Function: test5 + +define float @test5([100 x [100 x float]]* %Arr, [100 x [100 x float]]* %Arr2) { +entry: + br label %outer.header + +outer.header: ; preds = %outer.inc, %entry + %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ] + %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.inner.lcssa, %outer.inc ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %outer.header + %float.inner = phi float [ %float.outer , %outer.header ], [ %float.inner.inc.inc, %for.body3 ] + %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] + %arrayidx5 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr, i64 0, i64 %iv.inner, i64 %iv.outer + %vA = load float, float* %arrayidx5 + %float.inner.inc = fadd fast float %float.inner, %vA + %arrayidx6 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr2, i64 0, i64 %iv.inner, i64 %iv.outer + %vB = load float, float* %arrayidx6 + %float.inner.inc.inc = fadd fast float %float.inner.inc, %vB + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %exitcond = icmp eq i64 %iv.inner.next, 100 + br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: ; preds = %for.body3 + %float.inner.lcssa = phi float [ %float.inner.inc.inc, %for.body3 ] + %iv.outer.next = add nsw i64 %iv.outer, 1 + %cmp = icmp eq i64 %iv.outer.next, 100 + br i1 %cmp, label %outer.header, label %for.exit + +for.exit: ; preds = %outer.inc + %float.outer.lcssa = phi float [ %float.inner.lcssa, %outer.inc ] + ret float %float.outer.lcssa +} + +; Floating point reductions are not interchanged if not all the fp instructions +; involved allow reassociation. +; REMARKS: --- !Missed +; REMARKS-NEXT: Pass: loop-interchange +; REMARKS-NEXT: Name: UnsupportedPHIOuter +; REMARKS-NEXT: Function: test6 + +define float @test6([100 x [100 x float]]* %Arr, [100 x [100 x float]]* %Arr2) { +entry: + br label %outer.header + +outer.header: ; preds = %outer.inc, %entry + %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ] + %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.inner.lcssa, %outer.inc ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %outer.header + %float.inner = phi float [ %float.outer , %outer.header ], [ %float.inner.inc.inc, %for.body3 ] + %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ] + %arrayidx5 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr, i64 0, i64 %iv.inner, i64 %iv.outer + %vA = load float, float* %arrayidx5 + %float.inner.inc = fadd float %float.inner, %vA ; do not allow reassociation + %arrayidx6 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr2, i64 0, i64 %iv.inner, i64 %iv.outer + %vB = load float, float* %arrayidx6 + %float.inner.inc.inc = fadd fast float %float.inner.inc, %vB + %iv.inner.next = add nuw nsw i64 %iv.inner, 1 + %exitcond = icmp eq i64 %iv.inner.next, 100 + br i1 %exitcond, label %outer.inc, label %for.body3 + +outer.inc: ; preds = %for.body3 + %float.inner.lcssa = phi float [ %float.inner.inc.inc, %for.body3 ] + %iv.outer.next = add nsw i64 %iv.outer, 1 + %cmp = icmp eq i64 %iv.outer.next, 100 + br i1 %cmp, label %outer.header, label %for.exit + +for.exit: ; preds = %outer.inc + %float.outer.lcssa = phi float [ %float.inner.lcssa, %outer.inc ] + ret float %float.outer.lcssa +} \ No newline at end of file