Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11165,6 +11165,13 @@ if (Index->getMemoryVT() != MemVT) break; + // We do not allow under-aligned stores in order to prevent + // overriding stores. NOTE: this is a bad hack. Alignment SHOULD + // be irrelevant here; what MATTERS is that we not move memory + // operations that potentially overlap past each-other. + if (Index->getAlignment()*8 < MemVT.getSizeInBits()) + break; + // We found a potential memory operand to merge. StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++)); @@ -11249,12 +11256,18 @@ if (StoreNodes.size() < 2) return false; - // Sort the memory operands according to their distance from the base pointer. + // Sort the memory operands according to their distance from the + // base pointer. As a secondary criteria: make sure stores coming + // later in the code come first in the list. This is important for + // the non-UseAA case, because we're merging stores into the FINAL + // store along a chain which potentially contains aliasing stores. + // Thus, if there are multiple stores to the same address, the last + // one can be considered for merging but not the others. std::sort(StoreNodes.begin(), StoreNodes.end(), [](MemOpLink LHS, MemOpLink RHS) { return LHS.OffsetFromBase < RHS.OffsetFromBase || (LHS.OffsetFromBase == RHS.OffsetFromBase && - LHS.SequenceNum > RHS.SequenceNum); + LHS.SequenceNum < RHS.SequenceNum); }); // Scan the memory operations on the chain and find the first non-consecutive Index: test/CodeGen/X86/MergeConsecutiveStores.ll =================================================================== --- test/CodeGen/X86/MergeConsecutiveStores.ll +++ test/CodeGen/X86/MergeConsecutiveStores.ll @@ -288,12 +288,16 @@ ret void } -;; On x86, even unaligned copies can be merged to vector ops. +;; On x86, even unaligned copies should be merged to vector ops. +;; TODO: however, this cannot happen at the moment, due to brokenness +;; in MergeConsecutiveStores. See UseAA FIXME in DAGCombiner.cpp +;; visitSTORE. + ; CHECK-LABEL: merge_loads_no_align: ; load: -; CHECK: vmovups +; CHECK-NOT: vmovups ;; TODO ; store: -; CHECK: vmovups +; CHECK-NOT: vmovups ;; TODO ; CHECK: ret define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { %a1 = icmp sgt i32 %count, 0 Index: test/CodeGen/X86/dag-merge-fast-accesses.ll =================================================================== --- test/CodeGen/X86/dag-merge-fast-accesses.ll +++ test/CodeGen/X86/dag-merge-fast-accesses.ll @@ -50,11 +50,19 @@ } +;; TODO: FAST *should* be: +;; movups (%rdi), %xmm0 +;; movups %xmm0, 40(%rdi) +;; ..but is not currently. See the UseAA FIXME in DAGCombiner.cpp +;; visitSTORE. + define void @merge_vec_load_and_stores(i64 *%ptr) { ; FAST-LABEL: merge_vec_load_and_stores: ; FAST: # BB#0: -; FAST-NEXT: movups (%rdi), %xmm0 -; FAST-NEXT: movups %xmm0, 40(%rdi) +; FAST-NEXT: movq (%rdi), %rax +; FAST-NEXT: movq 8(%rdi), %rcx +; FAST-NEXT: movq %rax, 40(%rdi) +; FAST-NEXT: movq %rcx, 48(%rdi) ; FAST-NEXT: retq ; ; SLOW-LABEL: merge_vec_load_and_stores: Index: test/CodeGen/X86/stores-merging.ll =================================================================== --- test/CodeGen/X86/stores-merging.ll +++ test/CodeGen/X86/stores-merging.ll @@ -7,17 +7,51 @@ @e = common global %structTy zeroinitializer, align 4 -; CHECK-LABEL: f -define void @f() { -entry: +;; Ensure that MergeConsecutiveStores doesn't incorrectly reorder +;; store operations. The first test stores in increasing address +;; order, the second in decreasing -- but in both cases should have +;; the same result in memory in the end. -; CHECK: movabsq $528280977409, %rax +; CHECK-LABEL: redundant_stores_merging: +; CHECK: movl $123, e+8(%rip) +; CHECK: movabsq $1958505086977, %rax ; CHECK: movq %rax, e+4(%rip) -; CHECK: movl $456, e+8(%rip) - +define void @redundant_stores_merging() { +entry: store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4 store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4 store i32 456, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4 ret void } +;; This variant tests PR25154. +; CHECK-LABEL: redundant_stores_merging_reverse: +; CHECK: movl $123, e+8(%rip) +; CHECK: movabsq $1958505086977, %rax +; CHECK: movq %rax, e+4(%rip) +define void @redundant_stores_merging_reverse() { +entry: + store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4 + store i32 456, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4 + store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4 + ret void +} + +@b = common global [8 x i8] zeroinitializer, align 2 + +;; The 2-byte store to offset 3 overlaps the 2-byte store to offset 2; +;; these must not be reordered in MergeConsecutiveStores such that the +;; store to 3 comes first (e.g. by merging the stores to 0 and 2 into +;; a movl, after the store to 3). + +;; CHECK-LABEL: overlapping_stores_merging: +;; CHECK: movw $0, b+2(%rip) +;; CHECK: movw $2, b+3(%rip) +;; CHECK: movw $1, b(%rip) +define void @overlapping_stores_merging() { +entry: + store i16 0, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 2) to i16*), align 2 + store i16 2, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 3) to i16*), align 1 + store i16 1, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 0) to i16*), align 2 + ret void +}