Index: llvm/trunk/lib/Transforms/Scalar/EarlyCSE.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/EarlyCSE.cpp
+++ llvm/trunk/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -356,6 +356,9 @@
     unsigned Generation = 0;
     int MatchingId = -1;
     bool IsAtomic = false;
+
+    // TODO: Remove this flag.  It would be strictly stronger to add a record
+    // to the AvailableInvariants table when passing the invariant load instead.
     bool IsInvariant = false;

     LoadValue() = default;
@@ -373,6 +376,17 @@
                       LoadMapAllocator>;
   LoadHTType AvailableLoads;

+  // A scoped hash table mapping memory locations (represented as typed
+  // addresses) to generation numbers at which that memory location became
+  // (henceforth indefinitely) invariant.
+  using InvariantMapAllocator =
+      RecyclingAllocator<BumpPtrAllocator,
+                         ScopedHashTableVal<MemoryLocation, unsigned>>;
+  using InvariantHTType =
+      ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
+                      InvariantMapAllocator>;
+  InvariantHTType AvailableInvariants;
+
   /// \brief A scoped hash table of the current values of read-only call
   /// values.
@@ -401,15 +415,16 @@
   class NodeScope {
   public:
     NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
-              CallHTType &AvailableCalls)
-        : Scope(AvailableValues), LoadScope(AvailableLoads),
-          CallScope(AvailableCalls) {}
+              InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
+        : Scope(AvailableValues), LoadScope(AvailableLoads),
+          InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
     NodeScope(const NodeScope &) = delete;
     NodeScope &operator=(const NodeScope &) = delete;

   private:
     ScopedHTType::ScopeTy Scope;
     LoadHTType::ScopeTy LoadScope;
+    InvariantHTType::ScopeTy InvariantScope;
     CallHTType::ScopeTy CallScope;
   };
@@ -420,10 +435,13 @@
   class StackNode {
   public:
     StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
-              CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
-              DomTreeNode::iterator child, DomTreeNode::iterator end)
+              InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+              unsigned cg, DomTreeNode *n, DomTreeNode::iterator child,
+              DomTreeNode::iterator end)
         : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
-          EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls)
+          EndIter(end),
+          Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
+                 AvailableCalls)
       {}
     StackNode(const StackNode &) = delete;
     StackNode &operator=(const StackNode &) = delete;
@@ -563,6 +581,10 @@
                                                  ExpectedType);
   }

+  /// Return true if the instruction is known to only operate on memory
+  /// provably invariant in the given "generation".
+  bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
+
   bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
                            Instruction *EarlierInst, Instruction *LaterInst);
@@ -656,6 +678,27 @@
   return MSSA->dominates(LaterDef, EarlierMA);
 }

+bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
+  // A location loaded from with an invariant_load is assumed to *never* change
+  // within the visible scope of the compilation.
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    if (LI->getMetadata(LLVMContext::MD_invariant_load))
+      return true;
+
+  auto MemLocOpt = MemoryLocation::getOrNone(I);
+  if (!MemLocOpt)
+    // "target" intrinsic forms of loads aren't currently known to
+    // MemoryLocation::get.  TODO
+    return false;
+  MemoryLocation MemLoc = *MemLocOpt;
+  if (!AvailableInvariants.count(MemLoc))
+    return false;
+
+  // Is the generation at which this became invariant older than the
+  // current one?
+  return AvailableInvariants.lookup(MemLoc) <= GenAt;
+}
+
 bool EarlyCSE::processNode(DomTreeNode *Node) {
   bool Changed = false;
   BasicBlock *BB = Node->getBlock();
@@ -741,18 +784,28 @@
       continue;
     }

-    // Skip invariant.start intrinsics since they only read memory, and we can
-    // forward values across it. Also, we dont need to consume the last store
-    // since the semantics of invariant.start allow us to perform DSE of the
-    // last store, if there was a store following invariant.start. Consider:
+    // We can skip all invariant.start intrinsics since they only read memory,
+    // and we can forward values across them. For invariant starts without
+    // invariant ends, we can use the fact that the invariantness never ends to
+    // start a scope in the current generation which remains true for all
+    // future generations. Also, we don't need to consume the last store since
+    // the semantics of invariant.start allow us to perform DSE of the last
+    // store, if there was a store following invariant.start. Consider:
    //
    //   store 30, i8* p
    //   invariant.start(p)
    //   store 40, i8* p
    // We can DSE the store to 30, since the store 40 to invariant location p
    // causes undefined behaviour.
-    if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
+    if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
+      // If there are any uses, the scope might end.
+      if (!Inst->use_empty())
+        continue;
+      auto *CI = cast<CallInst>(Inst);
+      MemoryLocation MemLoc = MemoryLocation::getForArgument(CI, 1, TLI);
+      AvailableInvariants.insert(MemLoc, CurrentGeneration);
       continue;
+    }

    if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
      if (auto *CondI =
@@ -850,7 +903,8 @@
          !MemInst.isVolatile() && MemInst.isUnordered() &&
          // We can't replace an atomic load with one which isn't also atomic.
          InVal.IsAtomic >= MemInst.isAtomic() &&
-          (InVal.IsInvariant || MemInst.isInvariantLoad() ||
+          (InVal.IsInvariant ||
+           isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
           isSameMemGeneration(InVal.Generation, CurrentGeneration,
                               InVal.DefInst, Inst))) {
        Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
@@ -934,8 +988,9 @@
          InVal.MatchingId == MemInst.getMatchingId() &&
          // We don't yet handle removing stores with ordering of any kind.
          !MemInst.isVolatile() && MemInst.isUnordered() &&
-          isSameMemGeneration(InVal.Generation, CurrentGeneration,
-                              InVal.DefInst, Inst)) {
+          (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
+           isSameMemGeneration(InVal.Generation, CurrentGeneration,
+                               InVal.DefInst, Inst))) {
        // It is okay to have a LastStore to a different pointer here if MemorySSA
        // tells us that the load and store are from the same memory generation.
        // In that case, LastStore should keep its present value since we're
@@ -1027,8 +1082,9 @@

  // Process the root node.
  nodesToProcess.push_back(new StackNode(
-      AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
-      DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+      AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+      CurrentGeneration, DT.getRootNode(),
+      DT.getRootNode()->begin(), DT.getRootNode()->end()));

  // Save the current generation.
  unsigned LiveOutGeneration = CurrentGeneration;
@@ -1052,9 +1108,9 @@
      // Push the next child onto the stack.
      DomTreeNode *child = NodeToProcess->nextChild();
      nodesToProcess.push_back(
-          new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
-                        NodeToProcess->childGeneration(), child, child->begin(),
-                        child->end()));
+          new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
+                        AvailableCalls, NodeToProcess->childGeneration(),
+                        child, child->begin(), child->end()));
    } else {
      // It has been processed, and there are no more children to process,
      // so delete it and pop it off the stack.
Index: llvm/trunk/test/Transforms/EarlyCSE/invariant-loads.ll
===================================================================
--- llvm/trunk/test/Transforms/EarlyCSE/invariant-loads.ll
+++ llvm/trunk/test/Transforms/EarlyCSE/invariant-loads.ll
@@ -94,3 +94,12 @@
   call void @clobber_and_use(i32 %val1)
   ret void
 }
+
+define void @test_false_negative_dse(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_false_negative_dse
+; CHECK: store
+  %v1 = load i32, i32* %p, !invariant.load !{}
+  call void @clobber_and_use(i32 %v1)
+  store i32 %v1, i32* %p
+  ret void
+}
Index: llvm/trunk/test/Transforms/EarlyCSE/invariant.start.ll
===================================================================
--- llvm/trunk/test/Transforms/EarlyCSE/invariant.start.ll
+++ llvm/trunk/test/Transforms/EarlyCSE/invariant.start.ll
@@ -6,13 +6,12 @@

 ; Check that we do load-load forwarding over invariant.start, since it does not
 ; clobber memory
-define i8 @test1(i8 *%P) {
-  ; CHECK-LABEL: @test1(
+define i8 @test_bypass1(i8 *%P) {
+  ; CHECK-LABEL: @test_bypass1(
   ; CHECK-NEXT: %V1 = load i8, i8* %P
   ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   ; CHECK-NEXT: ret i8 0
-
   %V1 = load i8, i8* %P
   %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   %V2 = load i8, i8* %P
@@ -22,13 +21,12 @@

 ; Trivial Store->load forwarding over invariant.start
-define i8 @test2(i8 *%P) {
-  ; CHECK-LABEL: @test2(
+define i8 @test_bypass2(i8 *%P) {
+  ; CHECK-LABEL: @test_bypass2(
   ; CHECK-NEXT: store i8 42, i8* %P
   ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   ; CHECK-NEXT: ret i8 42
-
   store i8 42, i8* %P
   %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   %V1 = load i8, i8* %P
@@ -38,13 +36,11 @@
 ; We can DSE over invariant.start calls, since the first store to
 ; %P is valid, and the second store is actually unreachable based on semantics
 ; of invariant.start.
-define void @test3(i8* %P) {
-
-; CHECK-LABEL: @test3(
+define void @test_bypass3(i8* %P) {
+; CHECK-LABEL: @test_bypass3(
 ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
 ; CHECK-NEXT: store i8 60, i8* %P
-
   store i8 50, i8* %P
   %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
   store i8 60, i8* %P
@@ -54,9 +50,9 @@

 ; FIXME: Now the first store can actually be eliminated, since there is no read within
 ; the invariant region, between start and end.
-define void @test4(i8* %P) {
+define void @test_bypass4(i8* %P) {

-; CHECK-LABEL: @test4(
+; CHECK-LABEL: @test_bypass4(
 ; CHECK-NEXT: store i8 50, i8* %P
 ; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %P)
 ; CHECK-NEXT: call void @llvm.invariant.end.p0i8({}* %i, i64 1, i8* %P)
@@ -69,3 +65,214 @@
   store i8 60, i8* %P
   ret void
 }
+
+
+declare void @clobber()
+declare {}* @llvm.invariant.start.p0i32(i64 %size, i32* nocapture %ptr)
+declare void @llvm.invariant.end.p0i32({}*, i64, i32* nocapture) nounwind
+
+define i32 @test_before_load(i32* %p) {
+; CHECK-LABEL: @test_before_load
+; CHECK: ret i32 0
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_before_clobber(i32* %p) {
+; CHECK-LABEL: @test_before_clobber
+; CHECK: ret i32 0
+  %v1 = load i32, i32* %p
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_unanalzyable_load(i32* %p) {
+; CHECK-LABEL: @test_unanalzyable_load
+; CHECK: ret i32 0
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_after_clobber(i32* %p) {
+; CHECK-LABEL: @test_negative_after_clobber
+; CHECK: ret i32 %sub
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_merge(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_merge
+; CHECK: ret i32 0
+  %v1 = load i32, i32* %p
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call void @clobber()
+  br label %merge
+merge:
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_after_mergeclobber(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_negative_after_mergeclobber
+; CHECK: ret i32 %sub
+  %v1 = load i32, i32* %p
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call void @clobber()
+  br label %merge
+merge:
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+; In theory, this version could work, but earlycse is incapable of
+; merging facts along distinct paths.
+define i32 @test_false_negative_merge(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_false_negative_merge
+; CHECK: ret i32 %sub
+  %v1 = load i32, i32* %p
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  br label %merge
+merge:
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_merge_unanalyzable_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_merge_unanalyzable_load
+; CHECK: ret i32 0
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  %v1 = load i32, i32* %p
+  br i1 %cnd, label %merge, label %taken
+
+taken:
+  call void @clobber()
+  br label %merge
+merge:
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define void @test_dse_before_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_dse_before_load
+; CHECK-NOT: store
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  store i32 %v1, i32* %p
+  ret void
+}
+
+define void @test_dse_after_load(i32* %p, i1 %cnd) {
+; CHECK-LABEL: @test_dse_after_load
+; CHECK-NOT: store
+  %v1 = load i32, i32* %p
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @clobber()
+  store i32 %v1, i32* %p
+  ret void
+}
+
+
+; In this case, we have a false negative since MemoryLocation is implicitly
+; typed due to the use of a Value to represent the address.  Note that other
+; passes will canonicalize away the bitcasts in this example.
+define i32 @test_false_negative_types(i32* %p) {
+; CHECK-LABEL: @test_false_negative_types
+; CHECK: ret i32 %sub
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %pf = bitcast i32* %p to float*
+  %v2f = load float, float* %pf
+  %v2 = bitcast float %v2f to i32
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_size1(i32* %p) {
+; CHECK-LABEL: @test_negative_size1
+; CHECK: ret i32 %sub
+  call {}* @llvm.invariant.start.p0i32(i64 3, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_size2(i32* %p) {
+; CHECK-LABEL: @test_negative_size2
+; CHECK: ret i32 %sub
+  call {}* @llvm.invariant.start.p0i32(i64 0, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_negative_scope(i32* %p) {
+; CHECK-LABEL: @test_negative_scope
+; CHECK: ret i32 %sub
+  %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+define i32 @test_false_negative_scope(i32* %p) {
+; CHECK-LABEL: @test_false_negative_scope
+; CHECK: ret i32 %sub
+  %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %p)
+  %v1 = load i32, i32* %p
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %p)
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
+
+; Invariant load de facto starts an invariant.start scope of the appropriate size
+define i32 @test_invariant_load_scope(i32* %p) {
+; CHECK-LABEL: @test_invariant_load_scope
+; CHECK: ret i32 0
+  %v1 = load i32, i32* %p, !invariant.load !{}
+  call void @clobber()
+  %v2 = load i32, i32* %p
+  %sub = sub i32 %v1, %v2
+  ret i32 %sub
+}
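Background note (not part of the patch): the heart of the change is the comparison `AvailableInvariants.lookup(MemLoc) <= GenAt` in isOperatingOnInvariantMemAt. If the location became invariant no later than the generation at which the available value was recorded, no intervening clobber can have modified it, so the earlier value may be forwarded (and the matching store DSE'd). The standalone C++ sketch below models only that comparison; the string-keyed std::map and the explicit clobber lambda are simplifications standing in for EarlyCSE's MemoryLocation-keyed scoped hash table and its generation bumps on may-write instructions.

// Standalone sketch (not LLVM code): models the generation check that
// isOperatingOnInvariantMemAt performs against AvailableInvariants.
#include <cstdio>
#include <map>
#include <string>

int main() {
  // Location -> generation at which it became (indefinitely) invariant.
  std::map<std::string, unsigned> AvailableInvariants;
  unsigned CurrentGeneration = 0;

  // A may-write instruction (e.g. an opaque call) bumps the generation.
  auto Clobber = [&] { ++CurrentGeneration; };

  // invariant.start(p): record the current generation for p's location.
  AvailableInvariants["p"] = CurrentGeneration;

  unsigned GenOfEarlierLoad = CurrentGeneration; // first load of p
  Clobber();                                     // call void @clobber()

  // Later load of p: the location became invariant no later than the
  // generation of the earlier load, so the clobber cannot have changed it
  // and the two loads can be CSE'd.
  auto It = AvailableInvariants.find("p");
  bool CanForward =
      It != AvailableInvariants.end() && It->second <= GenOfEarlierLoad;
  std::printf("forwardable across clobber: %s\n", CanForward ? "yes" : "no");
  return 0;
}

In the pass itself the table is a ScopedHashTable so recorded locations automatically go out of scope when the dominator-tree walk leaves the block containing the invariant.start; the flat map above ignores that scoping, which only matters for correctness across branches (see test_negative_after_mergeclobber).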