Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12556,9 +12556,6 @@ LoadSDNode *Ld = dyn_cast(St->getValue()); if (!Ld) break; - // Loads must only have one use. - if (!Ld->hasNUsesOfValue(1, 0)) - break; // The memory operands must not be volatile. if (Ld->isVolatile() || Ld->isIndexed()) @@ -12568,9 +12565,6 @@ if (Ld->getExtensionType() != ISD::NON_EXTLOAD) break; - // The stored memory type must be the same. - if (Ld->getMemoryVT() != MemVT) - break; BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); // If this is not the first ptr that we check. @@ -12703,8 +12697,24 @@ // Transfer chain users from old loads to the new load. for (unsigned i = 0; i < NumElem; ++i) { LoadSDNode *Ld = cast(LoadNodes[i].MemNode); - DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), - SDValue(NewLoad.getNode(), 1)); + if (SDValue(Ld, 0).hasOneUse()) { + // Only the original store used value so just replace chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), + SDValue(NewLoad.getNode(), 1)); + } else { + // Multiple uses exist. Keep the old load in line with the new load. + SDValue Token0 = + DAG.getNode(ISD::TokenFactor, SDLoc(Ld), MVT::Other, SDValue(Ld, 1), + SDValue(NewLoad.getNode(), 1)); + // Don't cleanup Ld yet. This changes Token0 first argument to itself. + CombineTo(Ld, SDValue(Ld, 0), Token0, false); + SDValue Token = + DAG.getNode(ISD::TokenFactor, SDLoc(Ld), MVT::Other, SDValue(Ld, 1), + SDValue(NewLoad.getNode(), 1)); + // Reset Token0's input from itself to Ld's output chain. + CombineTo(Token0.getNode(), Token); + AddToWorklist(Ld); + } } // Replace the all stores with the new store. Index: test/CodeGen/X86/merge_store_duplicated_loads.ll =================================================================== --- test/CodeGen/X86/merge_store_duplicated_loads.ll +++ test/CodeGen/X86/merge_store_duplicated_loads.ll @@ -1,18 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -o - | FileCheck %s - +; PR32086 target triple = "x86_64-unknown-linux-gnu" define void @merge_double(double* noalias nocapture %st, double* noalias nocapture readonly %ld) #0 { ; CHECK-LABEL: merge_double: ; CHECK: # BB#0: -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd %xmm0, (%rdi) -; CHECK-NEXT: movsd %xmm1, 8(%rdi) -; CHECK-NEXT: movsd %xmm0, 16(%rdi) -; CHECK-NEXT: movsd %xmm1, 24(%rdi) +; CHECK-NEXT: movups (%rsi), %xmm0 +; CHECK-NEXT: movups %xmm0, (%rdi) +; CHECK-NEXT: movups %xmm0, 16(%rdi) ; CHECK-NEXT: retq %ld_idx1 = getelementptr inbounds double, double* %ld, i64 1 %ld0 = load double, double* %ld, align 8, !tbaa !2 @@ -32,12 +29,9 @@ define void @merge_loadstore_int(i64* noalias nocapture readonly %p, i64* noalias nocapture %q) local_unnamed_addr #0 { ; CHECK-LABEL: merge_loadstore_int: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: movq 8(%rdi), %rcx -; CHECK-NEXT: movq %rax, (%rsi) -; CHECK-NEXT: movq %rcx, 8(%rsi) -; CHECK-NEXT: movq %rax, 16(%rsi) -; CHECK-NEXT: movq %rcx, 24(%rsi) +; CHECK-NEXT: movups (%rdi), %xmm0 +; CHECK-NEXT: movups %xmm0, (%rsi) +; CHECK-NEXT: movups %xmm0, 16(%rsi) ; CHECK-NEXT: retq entry: %0 = load i64, i64* %p, align 8, !tbaa !1 @@ -57,11 +51,9 @@ ; CHECK-LABEL: merge_loadstore_int_with_extra_use: ; CHECK: # BB#0: # %entry ; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: movq 8(%rdi), %rcx -; CHECK-NEXT: movq %rax, (%rsi) -; CHECK-NEXT: movq %rcx, 8(%rsi) -; CHECK-NEXT: movq %rax, 16(%rsi) -; CHECK-NEXT: movq %rcx, 24(%rsi) +; CHECK-NEXT: movups (%rdi), %xmm0 +; CHECK-NEXT: movups %xmm0, (%rsi) +; CHECK-NEXT: movups %xmm0, 16(%rsi) ; CHECK-NEXT: retq entry: %0 = load i64, i64* %p, align 8, !tbaa !1