Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
@@ -221,6 +222,10 @@
                          cl::init(true),
                          cl::desc("Enable splitting large offset of GEP."));
 
+static cl::opt<bool>
+EnableSinkInvariantLoad("cgp-sink-invariant-load", cl::Hidden, cl::init(false),
+                        cl::desc("Enable sink of invariant load to its use"));
+
 namespace {
 
 enum ExtType {
@@ -355,6 +360,7 @@
     bool optimizeExt(Instruction *&I);
     bool optimizeExtUses(Instruction *I);
     bool optimizeLoadExt(LoadInst *Load);
+    bool optimizeInvariantLoad(LoadInst *Load);
     bool optimizeSelectInst(SelectInst *SI);
     bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
     bool optimizeSwitchInst(SwitchInst *SI);
@@ -5544,6 +5550,46 @@
   return true;
 }
 
+/// If Load is invariant, try to sink it next to its only user when that user
+/// lives in a colder basic block.
+/// \returns true if the load was moved.
+// Finding the best colder block requires a dominator tree, and building it
+// for each invariant load seems expensive. For now we support only one use
+// and try to sink into the user's basic block if it is colder.
+// TODO: support more users.
+bool CodeGenPrepare::optimizeInvariantLoad(LoadInst *Load) {
+  // We need BFI to compare the frequencies of the candidate blocks.
+  if (!BFI)
+    return false;
+  if (!Load->getMetadata(LLVMContext::MD_invariant_load))
+    return false;
+  // Volatile and atomic loads must not be moved.
+  if (!Load->isSimple())
+    return false;
+  // For simplicity support only one user.
+  if (!Load->hasOneUse())
+    return false;
+  Instruction *U = dyn_cast<Instruction>(*Load->user_begin());
+  // Cannot insert before a PHI node, so that case would need a better block.
+  if (!U || isa<PHINode>(U))
+    return false;
+  BasicBlock *LBB = Load->getParent();
+  BasicBlock *UBB = U->getParent();
+  if (LBB == UBB)
+    return false;
+  // Sink only if the user's block is strictly colder than the load's block.
+  if (BFI->getBlockFreq(LBB) <= BFI->getBlockFreq(UBB))
+    return false;
+  // Finally, the pointer must be known to be dereferenceable.
+  if (!isDereferenceablePointer(Load->getPointerOperand(), *DL, Load))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Move invariant load " << *Load << " to instruction "
+                    << *U << "\n");
+  Load->moveBefore(U);
+  return true;
+}
+
 /// Check if V (an operand of a select instruction) is an expensive instruction
 /// that is only used once.
 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
@@ -6595,6 +6641,8 @@
     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     if (TLI) {
       bool Modified = optimizeLoadExt(LI);
+      if (EnableSinkInvariantLoad)
+        Modified |= optimizeInvariantLoad(LI);
       unsigned AS = LI->getPointerAddressSpace();
       Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
       return Modified;
Index: test/Transforms/CodeGenPrepare/invariant.load.ll
===================================================================
--- /dev/null
+++ test/Transforms/CodeGenPrepare/invariant.load.ll
@@ -0,0 +1,116 @@
+; RUN: opt -codegenprepare -cgp-sink-invariant-load=true -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+; Check that we move the load to a colder block.
+define i8 @test_move_to_cold(i8* dereferenceable(1) %tmp, i1 %c) {
+; CHECK-LABEL: @test_move_to_cold
+enter:
+  ; CHECK-LABEL: enter:
+  ; CHECK-NEXT: br
+  %val = load i8, i8* %tmp, !invariant.load !0
+  br i1 %c, label %left, label %end, !prof !1
+left:
+  ; CHECK-LABEL: left:
+  ; CHECK-NEXT: load
+  ; CHECK-NEXT: %res.left
+  %res.left = add i8 %val, 1
+  br label %end
+end:
+  %res = phi i8 [%res.left, %left], [0, %enter]
+  ret i8 %res
+}
+
+; Check that we do not move the load if the pointer is not known to be dereferenceable.
+define i8 @test_no_derefencable(i8* %tmp, i1 %c) {
+; CHECK-LABEL: @test_no_derefencable
+enter:
+  ; CHECK-LABEL: enter:
+  ; CHECK-NEXT: %val
+  ; CHECK-NEXT: br
+  %val = load i8, i8* %tmp, !invariant.load !0
+  br i1 %c, label %left, label %end, !prof !1
+left:
+  ; CHECK-LABEL: left:
+  ; CHECK-NEXT: %res.left
+  %res.left = add i8 %val, 1
+  br label %end
+end:
+  %res = phi i8 [%res.left, %left], [0, %enter]
+  ret i8 %res
+}
+
+; Check that we do not move the load to a hotter block.
+define i8 @test_move_to_hot(i8* dereferenceable(1) %tmp, i1 %c) {
+; CHECK-LABEL: @test_move_to_hot
+enter:
+  ; CHECK-LABEL: enter:
+  ; CHECK-NEXT: %val
+  ; CHECK-NEXT: br
+  %val = load i8, i8* %tmp, !invariant.load !0
+  br i1 %c, label %left, label %end, !prof !2
+left:
+  %res.left = add i8 %val, 1
+  br label %end
+end:
+  %res = phi i8 [%res.left, %left], [0, %enter]
+  ret i8 %res
+}
+
+; Check that we do not move the load if its user is in the same block.
+define i8 @test_the_same_block(i8* dereferenceable(1) %tmp, i8* dereferenceable(1) %tmp1) {
+; CHECK-LABEL: @test_the_same_block
+enter:
+  ; CHECK-LABEL: enter:
+  ; CHECK-NEXT: %val
+  ; CHECK-NEXT: %val2
+  ; CHECK-NEXT: %res
+  %val = load i8, i8* %tmp, !invariant.load !0
+  %val2 = load i8, i8* %tmp1
+  %res = add i8 %val, %val2
+  ret i8 %res
+}
+
+; Check that we do not move the load if it has more than one use.
+define i8 @test_two_uses(i8* dereferenceable(1) %tmp, i8* dereferenceable(1) %tmp1) {
+; CHECK-LABEL: @test_two_uses
+enter:
+  ; CHECK-LABEL: enter:
+  ; CHECK-NEXT: %val
+  ; CHECK-NEXT: %val2
+  ; CHECK-NEXT: %res
+  ; CHECK-NEXT: %res2
+  %val = load i8, i8* %tmp, !invariant.load !0
+  %val2 = load i8, i8* %tmp1
+  %res = add i8 %val, %val
+  %res2 = add i8 %res, %val2
+  ret i8 %res2
+}
+
+; Check that we do not move a load whose only user is a phi node.
+define i8 @test_phi(i8* dereferenceable(1) %tmp, i8* dereferenceable(1) %tmp1, i1 %c) {
+; CHECK-LABEL: @test_phi
+enter:
+  ; CHECK-LABEL: enter:
+  ; CHECK-NEXT: br
+  br i1 %c, label %left, label %right, !prof !1
+left:
+  ; CHECK-LABEL: left:
+  ; CHECK-NEXT: load
+  %val = load i8, i8* %tmp, !invariant.load !0
+  br label %end
+right:
+  ; CHECK-LABEL: right:
+  ; CHECK-NEXT: load
+  %val1 = load i8, i8* %tmp1, !invariant.load !0
+  br label %end
+end:
+  ; CHECK-LABEL: end:
+  %res = phi i8 [%val, %left], [%val1, %right]
+  ret i8 %res
+}
+
+!0 = !{}
+!1 = !{!"branch_weights", i32 1, i32 100}
+!2 = !{!"branch_weights", i32 100, i32 1}