Index: lib/Transforms/Scalar/DeadStoreElimination.cpp =================================================================== --- lib/Transforms/Scalar/DeadStoreElimination.cpp +++ lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -49,12 +49,18 @@ STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); STATISTIC(NumCompletePartials, "Number of stores dead by later partials"); +STATISTIC(NumModifiedStores, "Number of stores modified"); static cl::opt EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking", cl::init(true), cl::Hidden, cl::desc("Enable partial-overwrite tracking in DSE")); +static cl::opt +EnablePartialStoreMerging("enable-dse-partial-store-merging", + cl::init(true), cl::Hidden, + cl::desc("Enable partial store merging in DSE")); + //===----------------------------------------------------------------------===// // Helper functions @@ -291,6 +297,7 @@ OverwriteBegin, OverwriteComplete, OverwriteEnd, + OverwritePartialEarlierWithFullLater, OverwriteUnknown }; } @@ -432,6 +439,19 @@ } } + // Check for an earlier store which writes to all the memory locations that + // the latter store writes to. + if (EnablePartialStoreMerging && LaterOff >= EarlierOff && + Earlier.Size >= Later.Size && + uint64_t(LaterOff - EarlierOff) + Later.Size <= Earlier.Size) { + DEBUG(dbgs() << "DSE: Partial overwrite an earlier load [" << EarlierOff + << ", " << int64_t(EarlierOff + Earlier.Size) + << ") by a later store [" << LaterOff << ", " + << int64_t(LaterOff + Later.Size) << ")\n"); + // TODO: Maybe come up with a better name? + return OverwritePartialEarlierWithFullLater; + } + // Another interesting case is if the later store overwrites the end of the // earlier store. // @@ -1099,6 +1119,8 @@ // If we find a write that is a) removable (i.e., non-volatile), b) is // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. + // Also try to merge two stores if a latter one only touches memory + // written to by the earlier one. if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { int64_t InstWriteOffset, DepWriteOffset; @@ -1128,6 +1150,68 @@ bool IsOverwriteEnd = (OR == OverwriteEnd); MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize, InstWriteOffset, LaterSize, IsOverwriteEnd); + } else if (EnablePartialStoreMerging && + OR == OverwritePartialEarlierWithFullLater) { + auto *Earlier = dyn_cast(DepWrite); + auto *Later = dyn_cast(Inst); + if (Earlier && isa(Earlier->getValueOperand()) && + Later && isa(Later->getValueOperand())) { + // If the store we find is: + // a) partially overwritten by the store to 'Loc' + // b) the latter store is fully contained in the earlier one and + // c) They both have a contant value + // Merge the two stores, replacing the earlier store's value with a + // merge of both values. + // TODO: Deal with other constant types (vectors, etc), and probably + // some mem intrinsics (if needed) + + APInt EarlierValue = + cast(Earlier->getValueOperand())->getValue(); + APInt LaterValue = + cast(Later->getValueOperand())->getValue(); + unsigned LaterBits = LaterValue.getBitWidth(); + assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth()); + LaterValue = LaterValue.zext(EarlierValue.getBitWidth()); + + // Offset of the smaller store inside the larger store + unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8; + unsigned ShiftAmount = + DL.isBigEndian() + ? EarlierValue.getBitWidth() - BitOffsetDiff - LaterBits + : BitOffsetDiff; + APInt Mask = + APInt::getBitsSet(EarlierValue.getBitWidth(), ShiftAmount, + ShiftAmount + LaterBits); + // Clear the bits we'll be replacing, then OR with the smaller + // store, shifted appropriately. + APInt Merged = (EarlierValue & ~Mask) | (LaterValue << ShiftAmount); + DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite + << "\n Later: " << *Inst + << "\n Merged Value: " << Merged << '\n'); + + auto *SI = new StoreInst( + ConstantInt::get(Earlier->getValueOperand()->getType(), Merged), + Earlier->getPointerOperand(), false, Earlier->getAlignment(), + DepWrite); + SI->copyMetadata(*DepWrite); + ++NumModifiedStores; + + // Replace earlier, wider, store + DepWrite->replaceAllUsesWith(SI); + size_t Idx = InstrOrdering.lookup(DepWrite); + InstrOrdering.erase(DepWrite); + InstrOrdering.insert(std::make_pair(SI, Idx)); + + //// Delete the old stores and now-dead instructions that feed them. + deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, &InstrOrdering); + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, + &InstrOrdering); + MadeChange = true; + + //// We erased DepWrite; start over. + InstDep = MD->getDependency(SI); + continue; + } } } Index: test/Transforms/DeadStoreElimination/PartialStore.ll =================================================================== --- test/Transforms/DeadStoreElimination/PartialStore.ll +++ test/Transforms/DeadStoreElimination/PartialStore.ll @@ -12,14 +12,16 @@ ; CHECK-NEXT: store i32 1234567 } -; Note that we could do better by merging the two stores into one. +; Check that we merge these two stores define void @test2(i32* %P) { ; CHECK-LABEL: @test2( store i32 0, i32* %P -; CHECK: store i32 %Q = bitcast i32* %P to i16* store i16 1, i16* %Q -; CHECK: store i16 +; CHECK-NOT: store +; Big endian store, so we get 1<<16 as the merged value +; CHECK: store i32 65536, i32* %P +; CHECK-NOT: store ret void } Index: test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll =================================================================== --- test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll +++ test/Transforms/DeadStoreElimination/combined-partial-overwrites.ll @@ -60,13 +60,9 @@ store i16 2020, i16* %wptrp, align 1 ret void -; CHECK-NOT: store i32 5, i32* %ptr -; CHECK-NOT: store i8 7, i8* %bptr -; CHECK: store i16 -30062, i16* %wptr -; CHECK-NOT: store i8 25, i8* %bptr2 -; CHECK: store i8 47, i8* %bptr3 -; CHECK: store i16 2020, i16* %wptrp, align 1 - +; CHECK-NOT: store +; CHECK: store i32 -1979194321, i32* %ptr +; CHECK-NOT: store ; CHECK: ret void } @@ -94,13 +90,14 @@ store i16 1126, i16* %wptr2, align 1 store i16 5656, i16* %wptr3, align 1 -; CHECK-NOT: store i32 5, i32* %ptr - +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 84280422, i32* %ptr +; All the intermediate stored bytes that are fully contained in the i32 memory +; region get merged to the i32 store. +; CHECK-NOT: store ; CHECK: store i16 1456, i16* %wptrm1, align 1 -; CHECK: store i16 1346, i16* %wptr, align 1 -; CHECK: store i16 1756, i16* %wptr1, align 1 -; CHECK: store i16 1126, i16* %wptr2, align 1 -; CHECK: store i16 5656, i16* %wptr3, align 1 +; CHECK-NEXT: store i16 5656, i16* %wptr3, align 1 +; CHECK-NOT: store ret void @@ -132,7 +129,14 @@ store i16 1126, i16* %wptr2, align 1 store i16 5656, i16* %wptr3, align 1 -; CHECK: store i32 5, i32* %ptr +; CHECK-NEXT: entry: +; Top (first) byte is 0x00 because we load from it in between the two stores. +; CHECK-NEXT: store i32 394342, i32* %ptr +; CHECK-NOT: store +; CHECK: store i16 1456, i16* %wptrm1, align 1 +; CHECK-NEXT: store i16 1346, i16* %wptr, align 1 +; CHECK-NEXT: store i16 5656, i16* %wptr3, align 1 +; CHECK-NOT: store ret i8 %v Index: test/Transforms/DeadStoreElimination/merge-stores.ll =================================================================== --- /dev/null +++ test/Transforms/DeadStoreElimination/merge-stores.ll @@ -0,0 +1,145 @@ +; RUN: opt -dse -enable-dse-partial-store-merging -S < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: @combine_overlapping_stores +define void @combine_overlapping_stores(i32 *%ptr) { +entry: + ;; This store should disappear, it should not be merged since we can't remove + ;; the latter ones (they're not fully contained in this one) and the latter + ;; ones overwrite the whole memory region written to by this store. + store i32 305419896, i32* %ptr ; 0x12345678 + %wptr = bitcast i32* %ptr to i16* + %wptrm1 = getelementptr inbounds i16, i16* %wptr, i64 -1 ; Lower 2B + %wptrp1 = getelementptr inbounds i16, i16* %wptr, i64 1 ; Upper 2B + %ptrL2B = bitcast i16* %wptrm1 to i32* + %ptrU2B = bitcast i16* %wptrp1 to i32* + + store i32 1126266452, i32* %ptrL2B ; 0x43217654 + store i32 1985229328, i32* %ptrU2B ; 0x76543210 + +; CHECK-NEXT: entry: +; CHECK-NOT: store +; CHECK: store i32 1126266452, i32* %ptrL2B +; CHECK-NEXT: store i32 1985229328, i32* %ptrU2B +; CHECK-NOT: store +; CHECK-NEXT: ret void + ret void +} + +; CHECK-LABEL: @byte_by_byte_replacement +define void @byte_by_byte_replacement(i32 *%ptr) { +entry: + ;; This store's value should be modified as it should be better to use larger + ;; stores than smaller ones. + store i32 305419896, i32* %ptr ; 0x12345678 + %bptr = bitcast i32* %ptr to i8* + %bptr1 = getelementptr inbounds i8, i8* %bptr, i64 1 + %bptr2 = getelementptr inbounds i8, i8* %bptr, i64 2 + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + + store i8 9, i8* %bptr + store i8 10, i8* %bptr1 + store i8 11, i8* %bptr2 + store i8 12, i8* %bptr3 + +; CHECK-NEXT: entry: +; CHECK-NOT: store +; CHECK: store i32 202050057, i32* %ptr +; CHECK-NOT: store +; CHECK-NEXT: ret void + ret void +} + +; CHECK-LABEL: @word_replacement +define void @word_replacement(i64 *%ptr) { +entry: + store i64 72623859790382856, i64* %ptr ; 0x0102030405060708 + + %wptr = bitcast i64* %ptr to i16* + %wptr1 = getelementptr inbounds i16, i16* %wptr, i64 1 + %wptr2 = getelementptr inbounds i16, i16* %wptr, i64 2 + %wptr3 = getelementptr inbounds i16, i16* %wptr, i64 3 + + ;; We should be able to merge these two stores with the i64 one above + store i16 4128, i16* %wptr1 ; 0x1020 + store i16 28800, i16* %wptr3 ; 0x7080 + +; CHECK-NEXT: entry: +; CHECK-NOT: store +; 0x0807201004038070 +; CHECK: store i64 8106482645252179720, i64* %ptr +; CHECK-NOT: store +; CHECK-NEXT: ret void + ret void +} + + +; CHECK-LABEL: @differently_sized_replacements +define void @differently_sized_replacements(i64 *%ptr) { +entry: + store i64 579005069656919567, i64* %ptr ; 0x08090a0b0c0d0e0f + + %bptr = bitcast i64* %ptr to i8* + %bptr6 = getelementptr inbounds i8, i8* %bptr, i64 6 + %wptr = bitcast i64* %ptr to i16* + %wptr2 = getelementptr inbounds i16, i16* %wptr, i64 2 + %dptr = bitcast i64* %ptr to i32* + + ;; We should be able to merge all these stores with the i64 one above + ; value (not bytes) stored before ; 08090a0b0c0d0e0f + store i8 7, i8* %bptr6 ; 07 + store i16 1541, i16* %wptr2 ; 0605 + store i32 67305985, i32* %dptr ; 04030201 + +; CHECK-NEXT: entry: +; CHECK-NOT: store +; 0x0807060504030201 +; CHECK: store i64 578437695752307201, i64* %ptr +; CHECK-NOT: store +; CHECK-NEXT: ret void + ret void +} + + +; CHECK-LABEL: @multiple_replacements_to_same_byte +define void @multiple_replacements_to_same_byte(i64 *%ptr) { +entry: + store i64 579005069656919567, i64* %ptr ; 0x08090a0b0c0d0e0f + + %bptr = bitcast i64* %ptr to i8* + %bptr3 = getelementptr inbounds i8, i8* %bptr, i64 3 + %wptr = bitcast i64* %ptr to i16* + %wptr1 = getelementptr inbounds i16, i16* %wptr, i64 1 + %dptr = bitcast i64* %ptr to i32* + + ;; We should be able to merge all these stores with the i64 one above + ; value (not bytes) stored before ; 08090a0b0c0d0e0f + store i8 7, i8* %bptr3 ; 07 + store i16 1541, i16* %wptr1 ; 0605 + store i32 67305985, i32* %dptr ; 04030201 + +; CHECK-NEXT: entry: +; CHECK-NOT: store +; 0x08090a0b04030201 +; CHECK: store i64 579005069522043393, i64* %ptr +; CHECK-NOT: store +; CHECK-NEXT: ret void + ret void +} + +;; Test case from PR31777 +%union.U = type { i64 } + +; CHECK-LABEL: @foo +define void @foo(%union.U* nocapture %u) { +entry: +; CHECK-NOT: store +; CHECK: store i64 42, i64* %i, align 8 +; CHECK-NEXT: ret void + %i = getelementptr inbounds %union.U, %union.U* %u, i64 0, i32 0 + store i64 0, i64* %i, align 8 + %s = bitcast %union.U* %u to i16* + store i16 42, i16* %s, align 8 + ret void +}