Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -449,6 +449,11 @@ EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT, bool &NarrowLoad); + //// Helper function for MergeConsecutiveStores which merges the + //// component store chains. + SDValue getMergeStoreChains(SmallVectorImpl &StoreNodes, + unsigned NumStores); + /// This is a helper function for MergeConsecutiveStores. When the source /// elements of the consecutive stores are all constants or all extracted /// vector elements, try to merge them into one larger store. @@ -12071,6 +12076,26 @@ return false; } +SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl &StoreNodes, + unsigned NumStores) { + SmallVector Chains; + SmallPtrSet Visited; + SDLoc StoreDL(StoreNodes[0].MemNode); + + for (unsigned i = 0; i < NumStores; ++i) { + Visited.insert(StoreNodes[i].MemNode); + } + + // don't include nodes that are children + for (unsigned i = 0; i < NumStores; ++i) { + if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0) + Chains.push_back(StoreNodes[i].MemNode->getChain()); + } + + assert(Chains.size() > 0 && "Chain should have generated a chain"); + return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains); +} + bool DAGCombiner::MergeStoresOfConstantsOrVecElts( SmallVectorImpl &StoreNodes, EVT MemVT, unsigned NumStores, bool IsConstantSrc, bool UseVector) { @@ -12153,17 +12178,8 @@ StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); } - SmallVector Chains; - - // Gather all Chains we're inheriting. As generally all chains are - // equal, do minor check to remove obvious redundancies. - Chains.push_back(StoreNodes[0].MemNode->getChain()); - for (unsigned i = 1; i < NumStores; ++i) - if (StoreNodes[0].MemNode->getChain() != StoreNodes[i].MemNode->getChain()) - Chains.push_back(StoreNodes[i].MemNode->getChain()); - LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores); SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), @@ -12192,6 +12208,54 @@ if (BasePtr.Base.isUndef()) return; + bool IsLoadSrc = isa(St->getValue()); + bool IsConstantSrc = isa(St->getValue()) || + isa(St->getValue()); + bool IsExtractVecSrc = + (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || + St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR); + auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr) -> bool { + if (Other->isVolatile() || Other->isIndexed()) + return false; + // We can merge constant floats to equivalent integers + if (Other->getMemoryVT() != MemVT) + if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) && + isa(Other->getValue()))) + return false; + Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG); + if (!Ptr.equalBaseIndex(BasePtr)) + return false; + if (IsLoadSrc) + return isa(Other->getValue()); + if (IsConstantSrc) + return (MemVT.isInteger() || MemVT == Other->getMemoryVT()) && + (isa(Other->getValue()) || + isa(Other->getValue())); + if (IsExtractVecSrc) + return (MemVT == Other->getMemoryVT() && + (Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || + Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR)); + return false; + }; + + // In the case FindBetterChain gives up we will find chains of + // consecutive stores. Do a preprocessing step to walk up the chain + // leaving the the head of the chain for the parallel search. + + StoreSDNode *STChain = St; + int64_t LastOffset = BasePtr.Offset; + while (StoreSDNode *Chain = dyn_cast(STChain->getChain())) { + BaseIndexOffset Ptr; + //Check Chain. + if (!CandidateMatch(Chain, Ptr) || Ptr.Offset > LastOffset) + break; + StoreNodes.push_back(MemOpLink(STChain, LastOffset)); + LastOffset = Ptr.Offset; + STChain = Chain; + } + + SDNode *RootNode = (STChain->getChain()).getNode(); + // We looking for a root node which is an ancestor to all mergable // stores. We search up through a load, to our root and then down // through all children. For instance we will find Store{1,2,3} if @@ -12203,13 +12267,11 @@ // |-------|-------| // Load Load Store3 // | | - // Store1 Store2 + // Store1 Store2 // // FIXME: We should be able to climb and // descend TokenFactors to find candidates as well. - SDNode *RootNode = (St->getChain()).getNode(); - // Set of Parents of Candidates std::set CandidateParents; @@ -12221,39 +12283,13 @@ } else CandidateParents.insert(RootNode); - bool IsLoadSrc = isa(St->getValue()); - bool IsConstantSrc = isa(St->getValue()) || - isa(St->getValue()); - bool IsExtractVecSrc = - (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || - St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR); - auto CorrectValueKind = [&](StoreSDNode *Other) -> bool { - if (IsLoadSrc) - return isa(Other->getValue()); - if (IsConstantSrc) - return (isa(Other->getValue()) || - isa(Other->getValue())); - if (IsExtractVecSrc) - return (Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || - Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR); - return false; - }; - // check all parents of mergable children for (auto P = CandidateParents.begin(); P != CandidateParents.end(); ++P) for (auto I = (*P)->use_begin(), E = (*P)->use_end(); I != E; ++I) if (I.getOperandNo() == 0) if (StoreSDNode *OtherST = dyn_cast(*I)) { - if (OtherST->isVolatile() || OtherST->isIndexed()) - continue; - // We can merge constant floats to equivalent integers - if (OtherST->getMemoryVT() != MemVT) - if (!(MemVT.isInteger() && MemVT.bitsEq(OtherST->getMemoryVT()) && - isa(OtherST->getValue()))) - continue; - BaseIndexOffset Ptr = - BaseIndexOffset::match(OtherST->getBasePtr(), DAG); - if (Ptr.equalBaseIndex(BasePtr) && CorrectValueKind(OtherST)) + BaseIndexOffset Ptr; + if (CandidateMatch(OtherST, Ptr)) StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset)); } } @@ -12494,9 +12530,6 @@ LoadSDNode *Ld = dyn_cast(St->getValue()); if (!Ld) break; - // Loads must only have one use. - if (!Ld->hasNUsesOfValue(1, 0)) - break; // The memory operands must not be volatile. if (Ld->isVolatile() || Ld->isIndexed()) @@ -12506,9 +12539,6 @@ if (Ld->getExtensionType() != ISD::NON_EXTLOAD) break; - // The stored memory type must be the same. - if (Ld->getMemoryVT() != MemVT) - break; BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); // If this is not the first ptr that we check. @@ -12611,14 +12641,6 @@ if (NumElem < 2) return false; - // Collect the chains from all merged stores. Because the common case - // all chains are the same, check if we match the first Chain. - SmallVector MergeStoreChains; - MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain()); - for (unsigned i = 1; i < NumElem; ++i) - if (StoreNodes[0].MemNode->getChain() != StoreNodes[i].MemNode->getChain()) - MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain()); - // Find if it is better to use vectors or integers to load and store // to memory. EVT JointMemOpVT; @@ -12638,8 +12660,7 @@ FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), FirstLoadAlign); - SDValue NewStoreChain = - DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains); + SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); AddToWorklist(NewStoreChain.getNode()); @@ -12650,8 +12671,24 @@ // Transfer chain users from old loads to the new load. for (unsigned i = 0; i < NumElem; ++i) { LoadSDNode *Ld = cast(LoadNodes[i].MemNode); - DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), - SDValue(NewLoad.getNode(), 1)); + if (SDValue(Ld, 0).hasOneUse()) { + // Only the original store used value so just replace chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), + SDValue(NewLoad.getNode(), 1)); + } else { + // Multiple uses exist. Keep the old load in line with the new load. + SDValue Token0 = + DAG.getNode(ISD::TokenFactor, SDLoc(Ld), MVT::Other, SDValue(Ld, 1), + SDValue(NewLoad.getNode(), 1)); + // Don't cleanup Ld yet. This changes Token0 first argument to itself. + CombineTo(Ld, SDValue(Ld, 0), Token0, false); + SDValue Token = + DAG.getNode(ISD::TokenFactor, SDLoc(Ld), MVT::Other, SDValue(Ld, 1), + SDValue(NewLoad.getNode(), 1)); + // Reset Token0's input from itself to Ld's output chain. + CombineTo(Token0.getNode(), Token); + AddToWorklist(Ld); + } } // Replace the all stores with the new store. Index: test/CodeGen/X86/merge_store_duplicated_loads.ll =================================================================== --- test/CodeGen/X86/merge_store_duplicated_loads.ll +++ test/CodeGen/X86/merge_store_duplicated_loads.ll @@ -1,18 +1,16 @@ + ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -o - | FileCheck %s - +; PR32086 target triple = "x86_64-unknown-linux-gnu" define void @merge_double(double* noalias nocapture %st, double* noalias nocapture readonly %ld) #0 { ; CHECK-LABEL: merge_double: ; CHECK: # BB#0: -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd %xmm0, (%rdi) -; CHECK-NEXT: movsd %xmm1, 8(%rdi) -; CHECK-NEXT: movsd %xmm0, 16(%rdi) -; CHECK-NEXT: movsd %xmm1, 24(%rdi) +; CHECK-NEXT: movups (%rsi), %xmm0 +; CHECK-NEXT: movups %xmm0, (%rdi) +; CHECK-NEXT: movups %xmm0, 16(%rdi) ; CHECK-NEXT: retq %ld_idx1 = getelementptr inbounds double, double* %ld, i64 1 %ld0 = load double, double* %ld, align 8, !tbaa !2 @@ -32,12 +30,9 @@ define void @merge_loadstore_int(i64* noalias nocapture readonly %p, i64* noalias nocapture %q) local_unnamed_addr #0 { ; CHECK-LABEL: merge_loadstore_int: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: movq 8(%rdi), %rcx -; CHECK-NEXT: movq %rax, (%rsi) -; CHECK-NEXT: movq %rcx, 8(%rsi) -; CHECK-NEXT: movq %rax, 16(%rsi) -; CHECK-NEXT: movq %rcx, 24(%rsi) +; CHECK-NEXT: movups (%rdi), %xmm0 +; CHECK-NEXT: movups %xmm0, (%rsi) +; CHECK-NEXT: movups %xmm0, 16(%rsi) ; CHECK-NEXT: retq entry: %0 = load i64, i64* %p, align 8, !tbaa !1 @@ -57,11 +52,9 @@ ; CHECK-LABEL: merge_loadstore_int_with_extra_use: ; CHECK: # BB#0: # %entry ; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: movq 8(%rdi), %rcx -; CHECK-NEXT: movq %rax, (%rsi) -; CHECK-NEXT: movq %rcx, 8(%rsi) -; CHECK-NEXT: movq %rax, 16(%rsi) -; CHECK-NEXT: movq %rcx, 24(%rsi) +; CHECK-NEXT: movups (%rdi), %xmm0 +; CHECK-NEXT: movups %xmm0, (%rsi) +; CHECK-NEXT: movups %xmm0, 16(%rsi) ; CHECK-NEXT: retq entry: %0 = load i64, i64* %p, align 8, !tbaa !1 @@ -75,7 +68,6 @@ %arrayidx5 = getelementptr inbounds i64, i64* %q, i64 3 store i64 %1, i64* %arrayidx5, align 8, !tbaa !1 ret i64 %0 - } attributes #0 = { "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" } Index: test/CodeGen/X86/stores-merging2.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/stores-merging2.ll @@ -0,0 +1,44 @@ +; RUN: llc %s -o - | FileCheck %s + +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-unknown-linux-gnu" + + + +@ptr = global [32 x i32] zeroinitializer, align 4 +@strptr = global [4 x i8] zeroinitializer, align 1 + +; FindBetterChains in SelectionDAG should give up and the 4 1-byte +; stores will remain in a chain. We should still be able to merge them +; into a 32-bit store. + +; CHECK-LABEL: @foo +; CHECK: movl $1684234849, strpt + +define i32 @foo() { + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 0) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 1) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 2) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 3) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 4) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 5) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 6) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 7) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 8) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 9) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 10) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 11) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 12) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 13) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 14) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 15) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 16) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 17) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 18) + store i32 0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @ptr, i32 0, i32 19) + store i8 97, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 0) + store i8 98, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 1) + store i8 99, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 2) + store i8 100, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @strptr, i32 0, i32 3) + ret i32 0 +}