NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Line breaks were rebuilt from the hunk line counts, indentation follows LLVM
style, and template arguments that had been stripped (dyn_cast<...>,
isa<...>, cast<...>, SmallVector<...>) were restored. Confirm the context
lines against the target revision, or apply with `patch -p0 -l` so that
whitespace differences in context lines are ignored.

Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12008,10 +12008,11 @@
         if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
           if (OtherST->isVolatile() || OtherST->isIndexed())
             continue;
-          // We can merge constant floats to equivalent integers
+          // We can merge constant or loaded floats to equivalent integers
           if (OtherST->getMemoryVT() != MemVT)
             if (!(MemVT.isInteger() && MemVT.bitsEq(OtherST->getMemoryVT()) &&
-                  isa<ConstantFPSDNode>(OtherST->getValue())))
+                  (isa<ConstantFPSDNode>(OtherST->getValue()) ||
+                   isa<LoadSDNode>(OtherST->getValue()))))
               continue;
           BaseIndexOffset Ptr =
               BaseIndexOffset::match(OtherST->getBasePtr(), DAG);
@@ -12252,9 +12253,6 @@
     LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
     if (!Ld) break;
 
-    // Loads must only have one use.
-    if (!Ld->hasNUsesOfValue(1, 0))
-      break;
 
     // The memory operands must not be volatile.
     if (Ld->isVolatile() || Ld->isIndexed())
@@ -12264,9 +12262,6 @@
     if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
       break;
 
-    // The stored memory type must be the same.
-    if (Ld->getMemoryVT() != MemVT)
-      break;
 
     BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
     // If this is not the first ptr that we check.
@@ -12369,13 +12364,8 @@
   if (NumElem < 2)
     return false;
 
-  // Collect the chains from all merged stores. Because the common case
-  // all chains are the same, check if we match the first Chain.
-  SmallVector<SDValue, 8> MergeStoreChains;
-  MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain());
-  for (unsigned i = 1; i < NumElem; ++i)
-    if (StoreNodes[0].MemNode->getChain() != StoreNodes[i].MemNode->getChain())
-      MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain());
+  SDLoc LoadDL(LoadNodes[0].MemNode);
+  SDLoc StoreDL(StoreNodes[0].MemNode);
 
   // Find if it is better to use vectors or integers to load and store
   // to memory.
@@ -12387,8 +12377,15 @@
     JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
   }
 
-  SDLoc LoadDL(LoadNodes[0].MemNode);
-  SDLoc StoreDL(StoreNodes[0].MemNode);
+  SmallVector<SDValue, 8> MergeStoreChains;
+  MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain());
+  for (unsigned i = 1; i < NumElem; ++i)
+    if (StoreNodes[0].MemNode->getChain() != StoreNodes[i].MemNode->getChain())
+      MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain());
+
+  SDValue NewStoreChain =
+      DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains);
+  AddToWorklist(NewStoreChain.getNode());
 
   // The merged loads are required to have the same incoming chain, so
   // using the first's chain is acceptable.
@@ -12396,22 +12393,32 @@
                                 FirstLoad->getBasePtr(),
                                 FirstLoad->getPointerInfo(), FirstLoadAlign);
 
-  SDValue NewStoreChain =
-      DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains);
-
-  AddToWorklist(NewStoreChain.getNode());
-
-  SDValue NewStore =
-      DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
-                   FirstInChain->getPointerInfo(), FirstStoreAlign);
-
   // Transfer chain users from old loads to the new load.
   for (unsigned i = 0; i < NumElem; ++i) {
     LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
-    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
-                                  SDValue(NewLoad.getNode(), 1));
+    if (SDValue(Ld, 0).hasOneUse()) {
+      // Only the original store used the value, so just replace the chain.
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+                                    SDValue(NewLoad.getNode(), 1));
+    } else {
+      // Multiple uses exist. Keep the old load in line with the new load.
+      SDValue Token0 =
+          DAG.getNode(ISD::TokenFactor, SDLoc(Ld), MVT::Other, SDValue(Ld, 1),
+                      SDValue(NewLoad.getNode(), 1));
+      // Don't clean up Ld yet.
+      CombineTo(Ld, SDValue(Ld, 0), Token0, false);
+      SDValue Token =
+          DAG.getNode(ISD::TokenFactor, SDLoc(Ld), MVT::Other, SDValue(Ld, 1),
+                      SDValue(NewLoad.getNode(), 1));
+      CombineTo(Token0.getNode(), Token);
+      AddToWorklist(Ld);
+    }
   }
 
+  SDValue NewStore =
+      DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
+                   FirstInChain->getPointerInfo(), FirstStoreAlign);
+
   // Replace the all stores with the new store.
   for (unsigned i = 0; i < NumElem; ++i)
     CombineTo(StoreNodes[i].MemNode, NewStore);
Index: test/CodeGen/X86/merge_store_duplicated_loads.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/merge_store_duplicated_loads.ll
@@ -0,0 +1,76 @@
+; RUN: llc %s -o - | FileCheck %s
+
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @merge_double(double* noalias nocapture %st, double* noalias nocapture readonly %ld) #0 {
+  %ld_idx1 = getelementptr inbounds double, double* %ld, i64 1
+  %ld0 = load double, double* %ld, align 8, !tbaa !2
+  %ld1 = load double, double* %ld_idx1, align 8, !tbaa !2
+
+  %st_idx1 = getelementptr inbounds double, double* %st, i64 1
+  %st_idx2 = getelementptr inbounds double, double* %st, i64 2
+  %st_idx3 = getelementptr inbounds double, double* %st, i64 3
+
+  store double %ld0, double* %st, align 8, !tbaa !2
+  store double %ld1, double* %st_idx1, align 8, !tbaa !2
+  store double %ld0, double* %st_idx2, align 8, !tbaa !2
+  store double %ld1, double* %st_idx3, align 8, !tbaa !2
+  ret void
+; CHECK-LABEL: @merge_double
+; CHECK: movups (%rsi), %xmm0
+; CHECK-DAG: movups %xmm0, (%rdi)
+; CHECK-DAG: movups %xmm0, 16(%rdi)
+; CHECK: retq
+}
+
+define void @merge_loadstore_int(i64* noalias nocapture readonly %p, i64* noalias nocapture %q) local_unnamed_addr #0 {
+entry:
+  %0 = load i64, i64* %p, align 8, !tbaa !1
+  %arrayidx1 = getelementptr inbounds i64, i64* %p, i64 1
+  %1 = load i64, i64* %arrayidx1, align 8, !tbaa !1
+  store i64 %0, i64* %q, align 8, !tbaa !1
+  %arrayidx3 = getelementptr inbounds i64, i64* %q, i64 1
+  store i64 %1, i64* %arrayidx3, align 8, !tbaa !1
+  %arrayidx4 = getelementptr inbounds i64, i64* %q, i64 2
+  store i64 %0, i64* %arrayidx4, align 8, !tbaa !1
+  %arrayidx5 = getelementptr inbounds i64, i64* %q, i64 3
+  store i64 %1, i64* %arrayidx5, align 8, !tbaa !1
+  ret void
+; CHECK-LABEL: @merge_loadstore_int
+; CHECK: movups (%rdi), %xmm0
+; CHECK-DAG: movups %xmm0, (%rsi)
+; CHECK-DAG: movups %xmm0, 16(%rsi)
+; CHECK: retq
+}
+
+define i64 @merge_loadstore_int_with_extra_use(i64* noalias nocapture readonly %p, i64* noalias nocapture %q) local_unnamed_addr #0 {
+entry:
+  %0 = load i64, i64* %p, align 8, !tbaa !1
+  %arrayidx1 = getelementptr inbounds i64, i64* %p, i64 1
+  %1 = load i64, i64* %arrayidx1, align 8, !tbaa !1
+  store i64 %0, i64* %q, align 8, !tbaa !1
+  %arrayidx3 = getelementptr inbounds i64, i64* %q, i64 1
+  store i64 %1, i64* %arrayidx3, align 8, !tbaa !1
+  %arrayidx4 = getelementptr inbounds i64, i64* %q, i64 2
+  store i64 %0, i64* %arrayidx4, align 8, !tbaa !1
+  %arrayidx5 = getelementptr inbounds i64, i64* %q, i64 3
+  store i64 %1, i64* %arrayidx5, align 8, !tbaa !1
+  ret i64 %0
+; CHECK-LABEL: @merge_loadstore_int_with_extra_use
+; CHECK-DAG: movq (%rdi), %rax
+; CHECK-DAG: movups (%rdi), %xmm0
+; CHECK-DAG: movups %xmm0, (%rsi)
+; CHECK-DAG: movups %xmm0, 16(%rsi)
+; CHECK: retq
+
+}
+
+attributes #0 = { "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
+
+
+!0 = !{!"clang version 5.0.0 (trunk 296467) (llvm/trunk 296476)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"double", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}