Index: llvm/lib/Transforms/Scalar/LICM.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LICM.cpp +++ llvm/lib/Transforms/Scalar/LICM.cpp @@ -105,6 +105,8 @@ STATISTIC(NumLoadStorePromoted, "Number of load and store promotions"); STATISTIC(NumMinMaxHoisted, "Number of min/max expressions hoisted out of the loop"); +STATISTIC(NumAddSubHoisted, + "Number of add/subtract expressions hoisted out of the loop"); /// Memory promotion is enabled by default. static cl::opt<bool> @@ -171,7 +173,7 @@ static bool pointerInvalidatedByBlock(BasicBlock &BB, MemorySSA &MSSA, MemoryUse &MU); /// Aggregates various functions for hoisting computations out of loop. -static bool hoistArithmetics(Instruction &I, Loop &L, +static bool hoistArithmetics(Instruction &I, Loop &L, ScalarEvolution &SE, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU); /// Try to simplify things like (A < INV_1 AND icmp A < INV_2) into (A < /// min(INV_1, INV_2)), if INV_1 and INV_2 are both loop invariants and their /// minimun can be computed outside of loop, and X is not a loop-invariant. static bool hoistMinMax(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU); +/// Try to simplify things like (INV1 - A < INV2) ---> (A > INV1 - INV2), +/// or (A + INV1 < INV2) --> (A < INV2 - INV1) +static bool hoistAddSub(Instruction &I, Loop &L, ScalarEvolution &SE, + ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU); static Instruction *cloneInstructionInExitBlock( Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI, const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater &MSSAU); @@ -989,7 +995,7 @@ // Try to reassociate instructions so that part of computations can be // done out of loop. 
- if (hoistArithmetics(I, *CurLoop, *SafetyInfo, MSSAU)) { + if (hoistArithmetics(I, *CurLoop, *SE, *SafetyInfo, MSSAU)) { Changed = true; continue; } @@ -2495,7 +2501,98 @@ return true; } -static bool hoistArithmetics(Instruction &I, Loop &L, +static bool hoistAddSub(Instruction &I, Loop &L, ScalarEvolution &SE, + ICFLoopSafetyInfo &SafetyInfo, + MemorySSAUpdater &MSSAU) { + using namespace PatternMatch; + ICmpInst::Predicate Pred; + Value *LHS, *RHS; + if (!match(&I, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) + return false; + + if (L.isLoopInvariant(LHS)) { + std::swap(LHS, RHS); + Pred = ICmpInst::getSwappedPredicate(Pred); + } + + if (!ICmpInst::isSigned(Pred) || L.isLoopInvariant(LHS) || + !L.isLoopInvariant(RHS) || !LHS->hasOneUse()) + return false; + Value *VariantOp, *InvariantOp; + bool NegateVariantOp = false, NegateInvariantOp = false; + if (match(LHS, m_Sub(m_Value(VariantOp), m_Value(InvariantOp)))) + NegateInvariantOp = true; + else if (!match(LHS, m_Add(m_Value(VariantOp), m_Value(InvariantOp)))) + return false; + + if (L.isLoopInvariant(VariantOp)) { + std::swap(VariantOp, InvariantOp); + std::swap(NegateVariantOp, NegateInvariantOp); + } + + if (L.isLoopInvariant(VariantOp) || !L.isLoopInvariant(InvariantOp)) + return false; + + if (!cast<Instruction>(LHS)->hasNoSignedWrap()) + return false; + + // Now we have the following pattern: + // pow(-1, NegateVariantOp) * VariantOp + + // pow(-1, NegateInvariantOp) * InvariantOp < + // RHS. + // The transforms we want to do are the following: + // Step 1: move all invariants to right hand side of inequality. + // pow(-1, NegateVariantOp) * VariantOp < + // RHS - + // pow(-1, NegateInvariantOp) * InvariantOp, + // or the same can be rewritten as + // pow(-1, NegateVariantOp) * VariantOp < + // RHS + pow(-1, NegateInvariantOp + 1) * InvariantOp. + // Step 2: if variant op is inverted, multiply both sides by -1 (and swap + // predicate if needed.) 
+ // VariantOp < + // pow(-1, NegateVariantOp) * RHS + + // pow(-1, NegateVariantOp + NegateInvariantOp + 1) * InvariantOp. + + bool ShouldSwapPredicate = NegateVariantOp; + bool ShouldNegateOp1 = NegateVariantOp; + bool ShouldNegateOp2 = NegateVariantOp ^ NegateInvariantOp ^ true; + assert(!(ShouldNegateOp1 && ShouldNegateOp2) && + "Could not end up negating both operands"); + + Value *Op1 = RHS; + Value *Op2 = InvariantOp; + if (ShouldNegateOp1) { + std::swap(Op1, Op2); + std::swap(ShouldNegateOp1, ShouldNegateOp2); + } + + // Make sure no overflow happens while computing invariant sum. + const SCEV *Op1S = SE.getSCEV(Op1); + const SCEV *Op2S = SE.getSCEV(Op2); + if (ShouldNegateOp2) + Op2S = SE.getNegativeSCEV(Op2S); + if (!SE.willNotOverflow(Instruction::BinaryOps::Add, /*Signed*/ true, Op1S, + Op2S, /*CtxI*/ &I)) + return false; + + if (ShouldSwapPredicate) + Pred = ICmpInst::getSwappedPredicate(Pred); + auto *Preheader = L.getLoopPreheader(); + assert(Preheader && "Loop is not in simplify form?"); + IRBuilder<> Builder(Preheader->getTerminator()); + Value *NewCmpOp = ShouldNegateOp2 + ? Builder.CreateSub(Op1, Op2, "invariant.op") + : Builder.CreateAdd(Op1, Op2, "invariant.op"); + auto *ICmp = cast<ICmpInst>(&I); + ICmp->setPredicate(Pred); + ICmp->setOperand(0, VariantOp); + ICmp->setOperand(1, NewCmpOp); + cast<Instruction>(LHS)->eraseFromParent(); + return true; +} + +static bool hoistArithmetics(Instruction &I, Loop &L, ScalarEvolution &SE, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU) { // Optimize complex patterns, such as (x < INV1 && x < INV2), turning them @@ -2505,6 +2602,11 @@ ++NumMinMaxHoisted; return true; } + // Optimize complex patterns, (INV1 - a < INV2) --> a > (INV1 - INV2). 
+ if (hoistAddSub(I, L, SE, SafetyInfo, MSSAU)) { + ++NumAddSubHoisted; + return true; + } return false; } Index: llvm/test/Transforms/LICM/hoist-add-sub.ll =================================================================== --- llvm/test/Transforms/LICM/hoist-add-sub.ll +++ llvm/test/Transforms/LICM/hoist-add-sub.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -passes=licm -verify-memoryssa < %s | FileCheck %s -; TODO: x - iv < 4 ==> iv > x - 4 +; x - iv < 4 ==> iv > x - 4 define i32 @test_01(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_01 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG0:![0-9]+]] ; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub i32 [[X]], 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[ARITH:%.*]] = sub nsw i32 [[X]], [[IV]] -; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[ARITH]], 4 +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp sgt i32 [[IV]], [[INVARIANT_OP]] ; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] ; CHECK: backedge: ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] @@ -114,18 +114,68 @@ ret i32 -2 } -; TODO: x + iv < 4 ==> iv < 4 - x +; Range info is missing for x, cannot prove no-overflow. Should not hoist. 
+define i32 @test_01_neg(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_01_neg +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG0]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[ARITH:%.*]] = sub nsw i32 [[X]], [[IV]] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[ARITH]], 4 +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; +entry: + %x = load i32, ptr %x_p + %length = load i32, ptr %length_p, !range !0 + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = sub nsw i32 %x, %iv + %x_check = icmp slt i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp slt i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 +} + +; x + iv < 4 ==> iv < 4 - x define i32 @test_02(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_02 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: 
[[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG0]] ; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub i32 4, [[X]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[ARITH:%.*]] = add nsw i32 [[X]], [[IV]] -; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[ARITH]], 4 +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[IV]], [[INVARIANT_OP]] ; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] ; CHECK: backedge: ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] @@ -227,18 +277,18 @@ ret i32 -2 } -; TODO: iv - x < 4 ==> iv < 4 + x +; iv - x < 4 ==> iv < 4 + x define i32 @test_03(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_03 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG1:![0-9]+]] ; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add i32 4, [[X]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[ARITH:%.*]] = sub nsw i32 [[IV]], [[X]] -; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[ARITH]], 4 +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[IV]], [[INVARIANT_OP]] ; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] ; CHECK: backedge: ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] @@ -340,18 +390,18 @@ ret i32 -2 } -; TODO: iv + x < 4 ==> iv < 4 - x +; iv + x < 4 ==> iv < 4 - x define i32 @test_04(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_04 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { ; CHECK-NEXT: entry: ; 
CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG0]] ; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG0]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub i32 4, [[X]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[ARITH:%.*]] = add nsw i32 [[IV]], [[X]] -; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[ARITH]], 4 +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp slt i32 [[IV]], [[INVARIANT_OP]] ; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] ; CHECK: backedge: ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]