Index: llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp =================================================================== --- llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -613,12 +614,29 @@ } } + // Sink division: (X / Y) * Z --> (X * Z) / Y + Value *FDiv; Value *Z; - if (match(&I, m_c_FMul(m_OneUse(m_FDiv(m_Value(X), m_Value(Y))), - m_Value(Z)))) { - // Sink division: (X / Y) * Z --> (X * Z) / Y - Value *NewFMul = Builder.CreateFMulFMF(X, Z, &I); - return BinaryOperator::CreateFDivFMF(NewFMul, Y, &I); + if (match(&I, + m_c_FMul(m_CombineAnd(m_Value(FDiv), + m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))), + m_Value(Z)))) { + // Avoid sinking a loop-invariant fdiv into a loop with the fmul. + // This is a hack to avoid putting an expensive math op into a loop that + // it doesn't need to be in. Ideally, we would be able to rely on other + // passes to undo that code movement if profitable, but it might require + // altering the pass pipeline and negatively impacting compile-time. 
+ auto ShouldSink = [&]() { + if (LI) { + if (Loop *L = LI->getLoopFor(I.getParent())) + return !L->isLoopInvariant(FDiv); + } + return true; + }; + if (ShouldSink()) { + Value *NewFMul = Builder.CreateFMulFMF(X, Z, &I); + return BinaryOperator::CreateFDivFMF(NewFMul, Y, &I); + } } // sqrt(X) * sqrt(Y) -> sqrt(X * Y) Index: llvm/test/Transforms/InstCombine/fmul.ll =================================================================== --- llvm/test/Transforms/InstCombine/fmul.ll +++ llvm/test/Transforms/InstCombine/fmul.ll @@ -1051,6 +1051,13 @@ ret float %mul } +; The loop-invariant fdiv is sunk into the loop because this +; test file does not require LoopInfo. This is the expected +; behavior in early invocations of InstCombine. Later, when +; LoopInfo is available, LICM or other passes should move the +; fdiv back outside of the loop, and we don't want to invert that +; in InstCombine. + define void @fmul_loop_invariant_fdiv(float* %a, float %x) { ; CHECK-LABEL: @fmul_loop_invariant_fdiv( ; CHECK-NEXT: entry: Index: llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll =================================================================== --- llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll +++ llvm/test/Transforms/PhaseOrdering/X86/vdiv-nounroll.ll @@ -17,19 +17,19 @@ define void @vdiv(ptr %a, float %b) #0 { ; CHECK-LABEL: @vdiv( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast float 1.000000e+00, [[B:%.*]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[BROADCAST_SPLAT]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA3:![0-9]+]] -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP0]] -; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP1]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; Index: llvm/test/Transforms/PhaseOrdering/lto-licm.ll =================================================================== --- llvm/test/Transforms/PhaseOrdering/lto-licm.ll +++ llvm/test/Transforms/PhaseOrdering/lto-licm.ll @@ -4,6 +4,7 @@ define void @hoist_fdiv(ptr %a, float %b) { ; CHECK-LABEL: @hoist_fdiv( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast float 1.000000e+00, [[B:%.*]] ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: ; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] @@ -12,9 +13,9 @@ ; CHECK: for.inc: ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = fdiv fast float [[TMP0]], [[B:%.*]] -; CHECK-NEXT: store float [[TMP1]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr 
[[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[TMP1]], [[TMP0]] +; CHECK-NEXT: store float [[TMP2]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 ; CHECK-NEXT: br label [[FOR_COND]] ; CHECK: for.end: