Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10733,6 +10733,8 @@ LatestNodeUsed = i; } + SmallVector<SDValue, 8> Chains; + // The latest Node in the DAG. LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; SDLoc DL(StoreNodes[0].MemNode); @@ -10760,6 +10762,7 @@ if (Val.getValueType() != MemVT) return false; Ops.push_back(Val); + Chains.push_back(St->getChain()); } // Build the extracted vector elements back into a vector. @@ -10779,6 +10782,8 @@ for (unsigned i = 0; i < NumStores; ++i) { unsigned Idx = IsLE ? (NumStores - 1 - i) : i; StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode); + Chains.push_back(St->getChain()); + SDValue Val = St->getValue(); StoreInt <<= ElementSizeBytes * 8; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) { @@ -10795,7 +10800,8 @@ StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); } - SDValue NewStore = DAG.getStore(LatestOp->getChain(), DL, StoredVal, + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), false, false, @@ -11247,6 +11253,9 @@ if (NumElem < 2) return false; + // Collect the chains from all merged stores. + SmallVector<SDValue, 8> MergeStoreChains; + // The latest Node in the DAG. unsigned LatestNodeUsed = 0; for (unsigned i=1; igetChain()); } LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; @@ -11273,12 +11284,17 @@ SDLoc LoadDL(LoadNodes[0].MemNode); SDLoc StoreDL(StoreNodes[0].MemNode); + // The merged loads are required to have the same chain, so using the first's + // chain is acceptable. 
SDValue NewLoad = DAG.getLoad( JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), false, false, false, FirstLoadAlign); + SDValue NewStoreChain = + DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains); + SDValue NewStore = DAG.getStore( - LatestOp->getChain(), StoreDL, NewLoad, FirstInChain->getBasePtr(), + NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), false, false, FirstStoreAlign); // Replace one of the loads with the new load. Index: test/CodeGen/X86/2012-11-28-merge-store-alias.ll =================================================================== --- test/CodeGen/X86/2012-11-28-merge-store-alias.ll +++ test/CodeGen/X86/2012-11-28-merge-store-alias.ll @@ -3,8 +3,8 @@ ; CHECK: merge_stores_can ; CHECK: callq foo ; CHECK: xorps %xmm0, %xmm0 -; CHECK-NEXT: movl 36(%rsp), %ebp ; CHECK-NEXT: movups %xmm0 +; CHECK-NEXT: movl 36(%rsp), %ebp ; CHECK: callq foo ; CHECK: ret declare i32 @foo([10 x i32]* ) Index: test/CodeGen/X86/merge-store-partially-alias-loads.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/merge-store-partially-alias-loads.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=x86-64 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +; It's OK to merge the load / store of the first 2 components, but +; they must not be placed on the same chain after merging. 
+ +; CHECK-LABEL: {{^}}merge_store_partial_overlap_load: +; CHECK: movw (%rdi), %ax +; CHECK-NEXT: movb 2(%rdi), %cl +; CHECK-NEXT: movw %ax, 1(%rdi) +; CHECK-NEXT: movb %cl, 3(%rdi) +; CHECK-NEXT: retq +define void @merge_store_partial_overlap_load([4 x i8]* %tmp) { + %tmp8 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 0 + %tmp10 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 1 + %tmp12 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 2 + %tmp14 = getelementptr [4 x i8], [4 x i8]* %tmp, i32 0, i8 3 + + %tmp9 = load i8, i8* %tmp8, align 1 ; base + 0 + %tmp11 = load i8, i8* %tmp10, align 1 ; base + 1 + %tmp13 = load i8, i8* %tmp12, align 1 ; base + 2 + + store i8 %tmp9, i8* %tmp10, align 1 ; base + 1 + store i8 %tmp11, i8* %tmp12, align 1 ; base + 2 + store i8 %tmp13, i8* %tmp14, align 1 ; base + 3 + +; Should emit +; load base + 0, base + 1 +; store base + 1, base + 2 +; load base + 2 +; store base + 3 + + ret void +}