diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -501,28 +501,40 @@
   if (BP1 != BP2)
     return OW_Unknown;
 
-  // The later store completely overlaps the earlier store if:
-  //
-  // 1. Both start at the same offset and the later one's size is greater than
-  //    or equal to the earlier one's, or
-  //
-  //      |--earlier--|
-  //      |-- later --|
-  //
-  // 2. The earlier store has an offset greater than the later offset, but which
-  //    still lies completely within the later store.
-  //
-  //        |--earlier--|
-  //    |----- later ------|
+  // The later access completely overlaps the earlier store if and only if
+  // both start and end of the earlier one are "inside" the later one:
+  //    |<->|--earlier--|<->|
+  //    |-------later-------|
+  // Accesses may overlap if and only if start of one of them is "inside"
+  // another one:
+  //    |<->|--earlier--|<----->|
+  //    |-------later-------|
+  //           OR
+  //    |----- earlier -----|
+  //    |<->|---later---|<----->|
   //
   // We have to be careful here as *Off is signed while *.Size is unsigned.
-  if (EarlierOff >= LaterOff &&
-      LaterSize >= EarlierSize &&
-      uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
-    return OW_Complete;
-  // Later may overwrite earlier completely with other partial writes.
-  return OW_MaybePartial;
+  // Check if the earlier access starts "not before" the later one.
+  if (EarlierOff >= LaterOff) {
+    // If the earlier access ends "not after" the later access then the earlier
+    // one is completely overwritten by the later one.
+    if (uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
+      return OW_Complete;
+    // If start of the earlier access is "before" end of the later access then
+    // accesses overlap.
+    else if ((uint64_t)(EarlierOff - LaterOff) < LaterSize)
+      return OW_MaybePartial;
+  }
+  // If start of the later access is "before" end of the earlier access then
+  // accesses overlap.
+  else if ((uint64_t)(LaterOff - EarlierOff) < EarlierSize) {
+    return OW_MaybePartial;
+  }
+
+  // Can reach here only if accesses are known not to overlap. There is no
+  // dedicated code to indicate no overlap so signal "unknown".
+  return OW_Unknown;
 }
 
 /// Return 'OW_Complete' if a store to the 'Later' location completely
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -dse -enable-dse-partial-store-merging=false < %s | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
-; RUN: opt -S -dse -enable-dse-partial-store-merging=false -dse-memoryssa-partial-store-limit=10 < %s | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s
+; RUN: opt -S -dse -enable-dse-partial-store-merging=false < %s | FileCheck --check-prefixes=CHECK %s
 
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux"
@@ -213,41 +212,21 @@
 ; We miss this case, because of an aggressive limit of partial overlap analysis.
 ; With a larger partial store limit, we remove the memset.
 define void @test4() {
-; DEFAULT-LIMIT-LABEL: @test4(
-; DEFAULT-LIMIT-NEXT: entry:
-; DEFAULT-LIMIT-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
-; DEFAULT-LIMIT-NEXT: [[V1:%.*]] = bitcast %struct.foostruct* [[BANG]] to i8*
-; DEFAULT-LIMIT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[V1]], i64 32
-; DEFAULT-LIMIT-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 8, i1 false)
-; DEFAULT-LIMIT-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
-; DEFAULT-LIMIT-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
-; DEFAULT-LIMIT-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
-; DEFAULT-LIMIT-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
-; DEFAULT-LIMIT-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
-; DEFAULT-LIMIT-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
-; DEFAULT-LIMIT-NEXT: call void @goFunc(%struct.foostruct* [[BANG]])
-; DEFAULT-LIMIT-NEXT: ret void
-;
-; LARGER-LIMIT-LABEL: @test4(
-; LARGER-LIMIT-NEXT: entry:
-; LARGER-LIMIT-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
-; LARGER-LIMIT-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
-; LARGER-LIMIT-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
-; LARGER-LIMIT-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
-; LARGER-LIMIT-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
-; LARGER-LIMIT-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
-; LARGER-LIMIT-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
-; LARGER-LIMIT-NEXT: call void @goFunc(%struct.foostruct* [[BANG]])
-; LARGER-LIMIT-NEXT: ret void
-;
+; CHECK-LABEL: @test4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
+; CHECK-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
+; CHECK-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
+; CHECK-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
+; CHECK-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
+; CHECK-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
+; CHECK-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
+; CHECK-NEXT: call void @goFunc(%struct.foostruct* [[BANG]])
+; CHECK-NEXT: ret void
 entry:
   %bang = alloca %struct.foostruct, align 8
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -dse %s -S | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
-; RUN: opt -dse -dse-memoryssa-partial-store-limit=10 %s -S | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s
+; RUN: opt -dse %s -S | FileCheck --check-prefixes=CHECK %s
 
 %struct.ham = type { [3 x double], [3 x double]}
 
@@ -11,52 +10,27 @@
 ; We miss this case, because of an aggressive limit of partial overlap analysis.
 ; With a larger partial store limit, we remove the memset.
 define void @overlap1(%struct.ham* %arg, i1 %cond) {
-; DEFAULT-LIMIT-LABEL: @overlap1(
-; DEFAULT-LIMIT-NEXT: bb:
-; DEFAULT-LIMIT-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
-; DEFAULT-LIMIT-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
-; DEFAULT-LIMIT-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
-; DEFAULT-LIMIT-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
-; DEFAULT-LIMIT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
-; DEFAULT-LIMIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
-; DEFAULT-LIMIT-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8*
-; DEFAULT-LIMIT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 32
-; DEFAULT-LIMIT-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP0]], i8 0, i64 16, i1 false)
-; DEFAULT-LIMIT-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
-; DEFAULT-LIMIT: bb7:
-; DEFAULT-LIMIT-NEXT: br label [[BB9:%.*]]
-; DEFAULT-LIMIT: bb8:
-; DEFAULT-LIMIT-NEXT: br label [[BB9]]
-; DEFAULT-LIMIT: bb9:
-; DEFAULT-LIMIT-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8
-; DEFAULT-LIMIT-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8
-; DEFAULT-LIMIT-NEXT: store double 3.000000e+00, double* [[TMP]], align 8
-; DEFAULT-LIMIT-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8
-; DEFAULT-LIMIT-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8
-; DEFAULT-LIMIT-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8
-; DEFAULT-LIMIT-NEXT: ret void
-;
-; LARGER-LIMIT-LABEL: @overlap1(
-; LARGER-LIMIT-NEXT: bb:
-; LARGER-LIMIT-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
-; LARGER-LIMIT-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
-; LARGER-LIMIT-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
-; LARGER-LIMIT-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
-; LARGER-LIMIT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
-; LARGER-LIMIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
-; LARGER-LIMIT-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
-; LARGER-LIMIT: bb7:
-; LARGER-LIMIT-NEXT: br label [[BB9:%.*]]
-; LARGER-LIMIT: bb8:
-; LARGER-LIMIT-NEXT: br label [[BB9]]
-; LARGER-LIMIT: bb9:
-; LARGER-LIMIT-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8
-; LARGER-LIMIT-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8
-; LARGER-LIMIT-NEXT: store double 3.000000e+00, double* [[TMP]], align 8
-; LARGER-LIMIT-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8
-; LARGER-LIMIT-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8
-; LARGER-LIMIT-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8
-; LARGER-LIMIT-NEXT: ret void
+; CHECK-LABEL: @overlap1(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
+; CHECK: bb7:
+; CHECK-NEXT: br label [[BB9:%.*]]
+; CHECK: bb8:
+; CHECK-NEXT: br label [[BB9]]
+; CHECK: bb9:
+; CHECK-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8
+; CHECK-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8
+; CHECK-NEXT: store double 3.000000e+00, double* [[TMP]], align 8
+; CHECK-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8
+; CHECK-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8
+; CHECK-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8
+; CHECK-NEXT: ret void
 ;
 bb:
   %tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2