Index: llvm/trunk/lib/Transforms/Scalar/LICM.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/LICM.cpp +++ llvm/trunk/lib/Transforms/Scalar/LICM.cpp @@ -81,6 +81,11 @@ DisablePromotion("disable-licm-promotion", cl::Hidden, cl::desc("Disable memory promotion in LICM pass")); +static cl::opt MaxNumUsesTraversed( + "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), + cl::desc("Max num uses visited for identifying load " + "invariance in loop using invariant start (default = 8)")); + static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo); @@ -480,6 +485,59 @@ SafetyInfo->BlockColors = colorEHFunclets(*Fn); } +// Return true if LI is invariant within scope of the loop. LI is invariant if +// CurLoop is dominated by an invariant.start representing the same memory location +// and size as the memory location LI loads from, and also the invariant.start +// has no uses. +static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT, + Loop *CurLoop) { + Value *Addr = LI->getOperand(0); + const DataLayout &DL = LI->getModule()->getDataLayout(); + const uint32_t LocSizeInBits = DL.getTypeSizeInBits( + cast(Addr->getType())->getElementType()); + + // if the type is i8 addrspace(x)*, we know this is the type of + // llvm.invariant.start operand + auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()), + LI->getPointerAddressSpace()); + unsigned BitcastsVisited = 0; + // Look through bitcasts until we reach the i8* type (this is invariant.start + // operand type). + while (Addr->getType() != PtrInt8Ty) { + auto *BC = dyn_cast(Addr); + // Avoid traversing high number of bitcast uses. + if (++BitcastsVisited > MaxNumUsesTraversed || !BC) + return false; + Addr = BC->getOperand(0); + } + + unsigned UsesVisited = 0; + // Traverse all uses of the load operand value, to see if invariant.start is + // one of the uses, and whether it dominates the load instruction. + for (auto *U : Addr->users()) { + // Avoid traversing for Load operand with high number of users. + if (++UsesVisited > MaxNumUsesTraversed) + return false; + IntrinsicInst *II = dyn_cast(U); + // If there are escaping uses of invariant.start instruction, the load maybe + // non-invariant. + if (!II || II->getIntrinsicID() != Intrinsic::invariant_start || + II->hasNUsesOrMore(1)) + continue; + unsigned InvariantSizeInBits = + cast(II->getArgOperand(0))->getSExtValue() * 8; + // Confirm the invariant.start location size contains the load operand size + // in bits. Also, the invariant.start should dominate the load, and we + // should not hoist the load out of a loop that contains this dominating + // invariant.start. + if (LocSizeInBits <= InvariantSizeInBits && + DT->properlyDominates(II->getParent(), CurLoop->getHeader())) + return true; + } + + return false; +} + bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, @@ -496,6 +554,10 @@ if (LI->getMetadata(LLVMContext::MD_invariant_load)) return true; + // This checks for an invariant.start dominating the load. + if (isLoadInvariantInLoop(LI, DT, CurLoop)) + return true; + // Don't hoist loads which have may-aliased stores in loop. uint64_t Size = 0; if (LI->getType()->isSized()) Index: llvm/trunk/test/Transforms/LICM/hoisting.ll =================================================================== --- llvm/trunk/test/Transforms/LICM/hoisting.ll +++ llvm/trunk/test/Transforms/LICM/hoisting.ll @@ -149,3 +149,174 @@ return: ret i32 %sum } + +declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly +declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) nounwind +declare void @escaping.invariant.start({}*) nounwind +; invariant.start dominates the load, and in this scope, the +; load is invariant. So, we can hoist the `addrld` load out of the loop. +define i32 @test_fence(i8* %addr, i32 %n, i8* %volatile) { +; CHECK-LABEL: @test_fence +; CHECK-LABEL: entry +; CHECK: invariant.start +; CHECK: %addrld = load atomic i32, i32* %addr.i unordered, align 8 +; CHECK: br label %loop +entry: + %gep = getelementptr inbounds i8, i8* %addr, i64 8 + %addr.i = bitcast i8* %gep to i32 * + store atomic i32 5, i32 * %addr.i unordered, align 8 + fence release + %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep) + br label %loop + +loop: + %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ] + %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ] + %volload = load atomic i8, i8* %volatile unordered, align 8 + fence acquire + %volchk = icmp eq i8 %volload, 0 + %addrld = load atomic i32, i32* %addr.i unordered, align 8 + %sel = select i1 %volchk, i32 0, i32 %addrld + %sum.next = add i32 %sel, %sum + %indvar.next = add i32 %indvar, 1 + %cond = icmp slt i32 %indvar.next, %n + br i1 %cond, label %loop, label %loopexit + +loopexit: + ret i32 %sum +} + + + +; Same as test above, but the load is no longer invariant (presence of +; invariant.end). We cannot hoist the addrld out of loop. +define i32 @test_fence1(i8* %addr, i32 %n, i8* %volatile) { +; CHECK-LABEL: @test_fence1 +; CHECK-LABEL: entry +; CHECK: invariant.start +; CHECK-NEXT: invariant.end +; CHECK-NEXT: br label %loop +entry: + %gep = getelementptr inbounds i8, i8* %addr, i64 8 + %addr.i = bitcast i8* %gep to i32 * + store atomic i32 5, i32 * %addr.i unordered, align 8 + fence release + %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep) + call void @llvm.invariant.end.p0i8({}* %invst, i64 4, i8* %gep) + br label %loop + +loop: + %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ] + %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ] + %volload = load atomic i8, i8* %volatile unordered, align 8 + fence acquire + %volchk = icmp eq i8 %volload, 0 + %addrld = load atomic i32, i32* %addr.i unordered, align 8 + %sel = select i1 %volchk, i32 0, i32 %addrld + %sum.next = add i32 %sel, %sum + %indvar.next = add i32 %indvar, 1 + %cond = icmp slt i32 %indvar.next, %n + br i1 %cond, label %loop, label %loopexit + +loopexit: + ret i32 %sum +} + +; same as test above, but instead of invariant.end, we have the result of +; invariant.start escaping through a call. We cannot hoist the load. +define i32 @test_fence2(i8* %addr, i32 %n, i8* %volatile) { +; CHECK-LABEL: @test_fence2 +; CHECK-LABEL: entry +; CHECK-NOT: load +; CHECK: br label %loop +entry: + %gep = getelementptr inbounds i8, i8* %addr, i64 8 + %addr.i = bitcast i8* %gep to i32 * + store atomic i32 5, i32 * %addr.i unordered, align 8 + fence release + %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep) + call void @escaping.invariant.start({}* %invst) + br label %loop + +loop: + %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ] + %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ] + %volload = load atomic i8, i8* %volatile unordered, align 8 + fence acquire + %volchk = icmp eq i8 %volload, 0 + %addrld = load atomic i32, i32* %addr.i unordered, align 8 + %sel = select i1 %volchk, i32 0, i32 %addrld + %sum.next = add i32 %sel, %sum + %indvar.next = add i32 %indvar, 1 + %cond = icmp slt i32 %indvar.next, %n + br i1 %cond, label %loop, label %loopexit + +loopexit: + ret i32 %sum +} + +; FIXME: invariant.start dominates the load, and in this scope, the +; load is invariant. So, we can hoist the `addrld` load out of the loop. +; Consider the loadoperand addr.i bitcasted before being passed to +; invariant.start +define i32 @test_fence3(i32* %addr, i32 %n, i8* %volatile) { +; CHECK-LABEL: @test_fence3 +; CHECK-LABEL: entry +; CHECK: invariant.start +; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8 +; CHECK: br label %loop +entry: + %addr.i = getelementptr inbounds i32, i32* %addr, i64 8 + %gep = bitcast i32* %addr.i to i8 * + store atomic i32 5, i32 * %addr.i unordered, align 8 + fence release + %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep) + br label %loop + +loop: + %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ] + %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ] + %volload = load atomic i8, i8* %volatile unordered, align 8 + fence acquire + %volchk = icmp eq i8 %volload, 0 + %addrld = load atomic i32, i32* %addr.i unordered, align 8 + %sel = select i1 %volchk, i32 0, i32 %addrld + %sum.next = add i32 %sel, %sum + %indvar.next = add i32 %indvar, 1 + %cond = icmp slt i32 %indvar.next, %n + br i1 %cond, label %loop, label %loopexit + +loopexit: + ret i32 %sum +} + +; We should not hoist the addrld out of the loop. +define i32 @test_fence4(i32* %addr, i32 %n, i8* %volatile) { +; CHECK-LABEL: @test_fence4 +; CHECK-LABEL: entry +; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8 +; CHECK: br label %loop +entry: + %addr.i = getelementptr inbounds i32, i32* %addr, i64 8 + %gep = bitcast i32* %addr.i to i8 * + br label %loop + +loop: + %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ] + %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ] + store atomic i32 5, i32 * %addr.i unordered, align 8 + fence release + %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep) + %volload = load atomic i8, i8* %volatile unordered, align 8 + fence acquire + %volchk = icmp eq i8 %volload, 0 + %addrld = load atomic i32, i32* %addr.i unordered, align 8 + %sel = select i1 %volchk, i32 0, i32 %addrld + %sum.next = add i32 %sel, %sum + %indvar.next = add i32 %indvar, 1 + %cond = icmp slt i32 %indvar.next, %n + br i1 %cond, label %loop, label %loopexit + +loopexit: + ret i32 %sum +}