diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -1068,58 +1068,90 @@ return MadeChange; } -static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset, - uint64_t &EarlierSize, int64_t LaterOffset, +static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart, + uint64_t &EarlierSize, int64_t LaterStart, uint64_t LaterSize, bool IsOverwriteEnd) { - // TODO: base this on the target vector size so that if the earlier - // store was too small to get vector writes anyway then its likely - // a good idea to shorten it - // Power of 2 vector writes are probably always a bad idea to optimize - // as any store/memset/memcpy is likely using vector instructions so - // shortening it to not vector size is likely to be slower auto *EarlierIntrinsic = cast(EarlierWrite); - unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment(); - if (!IsOverwriteEnd) - LaterOffset = int64_t(LaterOffset + LaterSize); - - if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) && - !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0)) - return false; + Align PrefAlign = EarlierIntrinsic->getDestAlign().valueOrOne(); + + // We assume that memet/memcpy operates in chunks of the "largest" native + // type size and aligned on the same value. That means optimal start and size + // of memset/memcpy should be modulo of preferred alignment of that type. That + // is it there is no any sense in trying to reduce store size any further + // since any "extra" stores comes for free anyway. + // On the other hand, maximum alignment we can achieve is limited by alignment + // of initial store. + + // TODO: Limit maximum alignment by preferred (or abi?) alignment of the + // "largest" native type. + // Note: What is the proper way to get that value? + // Should TargetTransformInfo::getRegisterBitWidth be used or anything else? + // PrefAlign = std::min(DL.getPrefTypeAlign(LargestType), PrefAlign); + + int64_t ToRemoveStart = 0; + uint64_t ToRemoveSize = 0; + // Compute start and size of the region to remove. Make sure 'PrefAlign' is + // maintained on the remaining store. + if (IsOverwriteEnd) { + uint64_t Off = + offsetToAlignment(uint64_t(LaterStart - EarlierStart), PrefAlign); + ToRemoveStart = LaterStart + Off; + if (EarlierSize <= uint64_t(ToRemoveStart - EarlierStart)) + return false; + ToRemoveSize = EarlierSize - uint64_t(ToRemoveStart - EarlierStart); + } else { + ToRemoveStart = EarlierStart; + assert(LaterSize >= uint64_t(EarlierStart - LaterStart) && + "Not overlapping accesses?"); + ToRemoveSize = LaterSize - uint64_t(EarlierStart - LaterStart); + uint64_t Off = offsetToAlignment(ToRemoveSize, PrefAlign); + if (Off != 0) { + if (ToRemoveSize <= (PrefAlign.value() - Off)) + return false; + ToRemoveSize -= PrefAlign.value() - Off; + } + assert(isAligned(PrefAlign, ToRemoveSize) && + "Should preserve selected alignment"); + } - int64_t NewLength = IsOverwriteEnd - ? LaterOffset - EarlierOffset - : EarlierSize - (LaterOffset - EarlierOffset); + assert(ToRemoveSize > 0 && "Shouldn't reach here if nothing to remove"); + assert(EarlierSize > ToRemoveSize && "Can't remove more than original size"); + uint64_t NewSize = EarlierSize - ToRemoveSize; if (auto *AMI = dyn_cast(EarlierWrite)) { // When shortening an atomic memory intrinsic, the newly shortened // length must remain an integer multiple of the element size. const uint32_t ElementSize = AMI->getElementSizeInBytes(); - if (0 != NewLength % ElementSize) + if (0 != NewSize % ElementSize) return false; } LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW " << (IsOverwriteEnd ? "END" : "BEGIN") << ": " - << *EarlierWrite << "\n KILLER (offset " << LaterOffset - << ", " << EarlierSize << ")\n"); + << *EarlierWrite << "\n KILLER ( offset " << ToRemoveStart + << ", " << ToRemoveSize << ")\n"); Value *EarlierWriteLength = EarlierIntrinsic->getLength(); Value *TrimmedLength = - ConstantInt::get(EarlierWriteLength->getType(), NewLength); + ConstantInt::get(EarlierWriteLength->getType(), NewSize); EarlierIntrinsic->setLength(TrimmedLength); + EarlierIntrinsic->setDestAlignment(PrefAlign); - EarlierSize = NewLength; if (!IsOverwriteEnd) { - int64_t OffsetMoved = (LaterOffset - EarlierOffset); - Value *Indices[1] = { - ConstantInt::get(EarlierWriteLength->getType(), OffsetMoved)}; + Value *Indices[1] = { ConstantInt::get(EarlierWriteLength->getType(), + ToRemoveSize) }; GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds( EarlierIntrinsic->getRawDest()->getType()->getPointerElementType(), EarlierIntrinsic->getRawDest(), Indices, "", EarlierWrite); NewDestGEP->setDebugLoc(EarlierIntrinsic->getDebugLoc()); EarlierIntrinsic->setDest(NewDestGEP); - EarlierOffset = EarlierOffset + OffsetMoved; } + + // Finally update start and size of earlier access. + if (!IsOverwriteEnd) + EarlierStart += ToRemoveSize; + EarlierSize = NewSize; + return true; } diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll @@ -234,12 +234,13 @@ ret void } -define void @dontwrite2to9(i32* nocapture %p) { -; CHECK-LABEL: @dontwrite2to9( +define void @write2to10(i32* nocapture %p) { +; CHECK-LABEL: @write2to10( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i1 false) ; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16* ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1 ; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64* @@ -257,12 +258,13 @@ ret void } -define void @dontwrite2to9_atomic(i32* nocapture %p) { -; CHECK-LABEL: @dontwrite2to9_atomic( +define void @write2to10_atomic(i32* nocapture %p) { +; CHECK-LABEL: @write2to10_atomic( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i32 4) ; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16* ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1 ; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64* @@ -391,3 +393,55 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32) nounwind +define void @ow_begin_align1(i8* nocapture %p) { +; CHECK-LABEL: @ow_begin_align1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 7 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP0]], i8 0, i64 25, i1 false) +; CHECK-NEXT: [[P2:%.*]] = bitcast i8* [[P]] to i64* +; CHECK-NEXT: store i64 1, i64* [[P2]], align 1 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds i8, i8* %p, i64 1 + call void @llvm.memset.p0i8.i64(i8* align 1 %p1, i8 0, i64 32, i1 false) + %p2 = bitcast i8* %p to i64* + store i64 1, i64* %p2, align 1 + ret void +} + +define void @ow_end_align4(i8* nocapture %p) { +; CHECK-LABEL: @ow_end_align4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i1 false) +; CHECK-NEXT: [[P2:%.*]] = bitcast i8* [[P]] to i64* +; CHECK-NEXT: store i64 1, i64* [[P2]], align 1 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds i8, i8* %p, i64 1 + call void @llvm.memset.p0i8.i64(i8* align 4 %p1, i8 0, i64 32, i1 false) + %p2 = bitcast i8* %p to i64* + store i64 1, i64* %p2, align 1 + ret void +} + +define void @ow_end_align8(i8* nocapture %p) { +; CHECK-LABEL: @ow_end_align8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[P1]], i8 0, i64 32, i1 false) +; CHECK-NEXT: [[P2:%.*]] = bitcast i8* [[P]] to i64* +; CHECK-NEXT: store i64 1, i64* [[P2]], align 1 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds i8, i8* %p, i64 1 + call void @llvm.memset.p0i8.i64(i8* align 8 %p1, i8 0, i64 32, i1 false) + %p2 = bitcast i8* %p to i64* + store i64 1, i64* %p2, align 1 + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll @@ -388,3 +388,61 @@ store i64 3, i64* %base64_3, align 8 ret void } + +define void @ow_end_align1(i8* nocapture %p) { +; CHECK-LABEL: @ow_end_align1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[P1]], i8 0, i64 27, i1 false) +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 27 +; CHECK-NEXT: [[P2_I64:%.*]] = bitcast i8* [[P2]] to i64* +; CHECK-NEXT: store i64 1, i64* [[P2_I64]], align 1 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds i8, i8* %p, i64 1 + call void @llvm.memset.p0i8.i64(i8* align 1 %p1, i8 0, i64 32, i1 false) + %p2 = getelementptr inbounds i8, i8* %p1, i64 27 + %p2.i64 = bitcast i8* %p2 to i64* + store i64 1, i64* %p2.i64, align 1 + ret void +} + +define void @ow_end_align4(i8* nocapture %p) { +; CHECK-LABEL: @ow_end_align4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P1]], i8 0, i64 28, i1 false) +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 27 +; CHECK-NEXT: [[P2_I64:%.*]] = bitcast i8* [[P2]] to i64* +; CHECK-NEXT: store i64 1, i64* [[P2_I64]], align 1 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds i8, i8* %p, i64 1 + call void @llvm.memset.p0i8.i64(i8* align 4 %p1, i8 0, i64 32, i1 false) + %p2 = getelementptr inbounds i8, i8* %p1, i64 27 + %p2.i64 = bitcast i8* %p2 to i64* + store i64 1, i64* %p2.i64, align 1 + ret void +} + +define void @ow_end_align8(i8* nocapture %p) { +; CHECK-LABEL: @ow_end_align8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i64 1 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[P1]], i8 0, i64 32, i1 false) +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 27 +; CHECK-NEXT: [[P2_I64:%.*]] = bitcast i8* [[P2]] to i64* +; CHECK-NEXT: store i64 1, i64* [[P2_I64]], align 1 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds i8, i8* %p, i64 1 + call void @llvm.memset.p0i8.i64(i8* align 8 %p1, i8 0, i64 32, i1 false) + %p2 = getelementptr inbounds i8, i8* %p1, i64 27 + %p2.i64 = bitcast i8* %p2 to i64* + store i64 1, i64* %p2.i64, align 1 + ret void +} + diff --git a/llvm/test/Transforms/DeadStoreElimination/MemDepAnalysis/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/MemDepAnalysis/OverwriteStoreBegin.ll --- a/llvm/test/Transforms/DeadStoreElimination/MemDepAnalysis/OverwriteStoreBegin.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MemDepAnalysis/OverwriteStoreBegin.ll @@ -234,12 +234,13 @@ ret void } -define void @dontwrite2to9(i32* nocapture %p) { -; CHECK-LABEL: @dontwrite2to9( +define void @write2to10(i32* nocapture %p) { +; CHECK-LABEL: @write2to10( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i1 false) ; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16* ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1 ; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64* @@ -257,12 +258,13 @@ ret void } -define void @dontwrite2to9_atomic(i32* nocapture %p) { -; CHECK-LABEL: @dontwrite2to9_atomic( +define void @write2to10_atomic(i32* nocapture %p) { +; CHECK-LABEL: @write2to10_atomic( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 28, i32 4) ; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i16* ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[P4]], i64 1 ; CHECK-NEXT: [[P5:%.*]] = bitcast i16* [[ARRAYIDX2]] to i64*