Index: llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -106,21 +106,31 @@
 } // end anonymous namespace
 
 bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
+  // Count the stores of non-undef values and note whether any store is undef.
+  size_t NumNonUndefStores = 0;
+  bool NoUndefStores = true;
+  for (Instruction *SI : TheStores) {
+    if (auto *Store = dyn_cast<StoreInst>(SI)) {
+      Value *StoredVal = Store->getValueOperand();
+      if (!isa<UndefValue>(StoredVal))
+        ++NumNonUndefStores;
+      else
+        NoUndefStores = false;
+    } else {
+      // If any of the stores are a memset, then it is always good to extend the
+      // memset.
+      return true;
+    }
+  }
   // If we found more than 4 stores to merge or 16 bytes, use memset.
-  if (TheStores.size() >= 4 || End-Start >= 16) return true;
+  if (NumNonUndefStores >= 4 || (NoUndefStores && End - Start >= 16))
+    return true;
 
   // If there is nothing to merge, don't do anything.
-  if (TheStores.size() < 2) return false;
-
-  // If any of the stores are a memset, then it is always good to extend the
-  // memset.
-  for (Instruction *SI : TheStores)
-    if (!isa<StoreInst>(SI))
-      return true;
-
   // Assume that the code generator is capable of merging pairs of stores
   // together if it wants to.
-  if (TheStores.size() == 2) return false;
+  if (NumNonUndefStores <= 2)
+    return false;
 
   // If we have fewer than 8 stores, it can still be worthwhile to do this.
   // For example, merging 4 i8 stores into an i32 store is useful almost always.
@@ -144,7 +154,7 @@
   // If we will reduce the # stores (according to this heuristic), do the
   // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
   // etc.
-  return TheStores.size() > NumPointerStores+NumByteStores;
+  return NumNonUndefStores > NumPointerStores + NumByteStores;
 }
 
 namespace {
@@ -456,8 +466,15 @@
 
       // Check to see if this stored value is of the same byte-splattable value.
       Value *StoredByte = isBytewiseValue(StoredVal, DL);
+      // If the previously stored value is undef, we can replace it with the
+      // newly stored value (which may be undef).
+      // Conversely, if the newly stored value is undef, we can replace it with
+      // the previously stored one.
       if (isa<UndefValue>(ByteVal) && StoredByte)
         ByteVal = StoredByte;
+      else if (StoredByte && isa<UndefValue>(StoredByte))
+        StoredByte = ByteVal;
+
       if (ByteVal != StoredByte)
         break;
 
Index: llvm/test/Transforms/MemCpyOpt/merge-undef-memset.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/MemCpyOpt/merge-undef-memset.ll
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=memcpyopt -S -verify-memoryssa | FileCheck %s
+
+%S = type { { i8, i8 }, { i8, i8 }, { i8, i8 }, { i8, i8 }, { i8, i8 }, { i8, i8 }, { i8, i8 }, { i8, i8 } }
+
+define void @alternating(ptr %arg) {
+; CHECK-LABEL: @alternating(
+; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[ARG:%.*]], i32 0, i32 1
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds [[S:%.*]], ptr [[ARG]], i32 0, i32 1
+; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I1]], i32 0, i32 1
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 2
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I3]], i32 0, i32 1
+; CHECK-NEXT:    [[I5:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 3
+; CHECK-NEXT:    [[I6:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I5]], i32 0, i32 1
+; CHECK-NEXT:    [[I7:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 4
+; CHECK-NEXT:    [[I8:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I7]], i32 0, i32 1
+; CHECK-NEXT:    [[I9:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 5
+; CHECK-NEXT:    [[I10:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I9]], i32 0, i32 1
+; CHECK-NEXT:    [[I11:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 6
+; CHECK-NEXT:    [[I12:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I11]], i32 0, i32 1
+; CHECK-NEXT:    [[I13:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 7
+; CHECK-NEXT:    [[I14:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I13]], i32 0, i32 1
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 1 [[ARG]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  store i8 0, ptr %arg, align 1
+  %i = getelementptr inbounds { i8, i8 }, ptr %arg, i32 0, i32 1
+  store i8 undef, ptr %i, align 1
+  %i1 = getelementptr inbounds %S, ptr %arg, i32 0, i32 1
+  store i8 0, ptr %i1, align 1
+  %i2 = getelementptr inbounds { i8, i8 }, ptr %i1, i32 0, i32 1
+  store i8 undef, ptr %i2, align 1
+  %i3 = getelementptr inbounds %S, ptr %arg, i32 0, i32 2
+  store i8 0, ptr %i3, align 1
+  %i4 = getelementptr inbounds { i8, i8 }, ptr %i3, i32 0, i32 1
+  store i8 undef, ptr %i4, align 1
+  %i5 = getelementptr inbounds %S, ptr %arg, i32 0, i32 3
+  store i8 0, ptr %i5, align 1
+  %i6 = getelementptr inbounds { i8, i8 }, ptr %i5, i32 0, i32 1
+  store i8 undef, ptr %i6, align 1
+  %i7 = getelementptr inbounds %S, ptr %arg, i32 0, i32 4
+  store i8 0, ptr %i7, align 1
+  %i8 = getelementptr inbounds { i8, i8 }, ptr %i7, i32 0, i32 1
+  store i8 undef, ptr %i8, align 1
+  %i9 = getelementptr inbounds %S, ptr %arg, i32 0, i32 5
+  store i8 0, ptr %i9, align 1
+  %i10 = getelementptr inbounds { i8, i8 }, ptr %i9, i32 0, i32 1
+  store i8 undef, ptr %i10, align 1
+  %i11 = getelementptr inbounds %S, ptr %arg, i32 0, i32 6
+  store i8 0, ptr %i11, align 1
+  %i12 = getelementptr inbounds { i8, i8 }, ptr %i11, i32 0, i32 1
+  store i8 undef, ptr %i12, align 1
+  %i13 = getelementptr inbounds %S, ptr %arg, i32 0, i32 7
+  store i8 0, ptr %i13, align 1
+  %i14 = getelementptr inbounds { i8, i8 }, ptr %i13, i32 0, i32 1
+  store i8 undef, ptr %i14, align 1
+  ret void
+}
+
+define void @mostly_undef(ptr %arg) {
+; CHECK-LABEL: @mostly_undef(
+; CHECK-NEXT:    store i8 0, ptr [[ARG:%.*]], align 1
+; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[ARG]], i32 0, i32 1
+; CHECK-NEXT:    store i8 undef, ptr [[I]], align 1
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds [[S:%.*]], ptr [[ARG]], i32 0, i32 1
+; CHECK-NEXT:    store i8 undef, ptr [[I1]], align 1
+; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I1]], i32 0, i32 1
+; CHECK-NEXT:    store i8 undef, ptr [[I2]], align 1
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 2
+; CHECK-NEXT:    store i8 undef, ptr [[I3]], align 1
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I3]], i32 0, i32 1
+; CHECK-NEXT:    store i8 undef, ptr [[I4]], align 1
+; CHECK-NEXT:    [[I5:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 3
+; CHECK-NEXT:    store i8 undef, ptr [[I5]], align 1
+; CHECK-NEXT:    [[I6:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I5]], i32 0, i32 1
+; CHECK-NEXT:    store i8 undef, ptr [[I6]], align 1
+; CHECK-NEXT:    [[I7:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 4
+; CHECK-NEXT:    store i8 undef, ptr [[I7]], align 1
+; CHECK-NEXT:    [[I8:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I7]], i32 0, i32 1
+; CHECK-NEXT:    store i8 undef, ptr [[I8]], align 1
+; CHECK-NEXT:    [[I9:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 5
+; CHECK-NEXT:    store i8 undef, ptr [[I9]], align 1
+; CHECK-NEXT:    [[I10:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I9]], i32 0, i32 1
+; CHECK-NEXT:    store i8 undef, ptr [[I10]], align 1
+; CHECK-NEXT:    [[I11:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 6
+; CHECK-NEXT:    store i8 undef, ptr [[I11]], align 1
+; CHECK-NEXT:    [[I12:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I11]], i32 0, i32 1
+; CHECK-NEXT:    store i8 undef, ptr [[I12]], align 1
+; CHECK-NEXT:    [[I13:%.*]] = getelementptr inbounds [[S]], ptr [[ARG]], i32 0, i32 7
+; CHECK-NEXT:    store i8 undef, ptr [[I13]], align 1
+; CHECK-NEXT:    [[I14:%.*]] = getelementptr inbounds { i8, i8 }, ptr [[I13]], i32 0, i32 1
+; CHECK-NEXT:    store i8 0, ptr [[I14]], align 1
+; CHECK-NEXT:    ret void
+;
+  store i8 0, ptr %arg, align 1
+  %i = getelementptr inbounds { i8, i8 }, ptr %arg, i32 0, i32 1
+  store i8 undef, ptr %i, align 1
+  %i1 = getelementptr inbounds %S, ptr %arg, i32 0, i32 1
+  store i8 undef, ptr %i1, align 1
+  %i2 = getelementptr inbounds { i8, i8 }, ptr %i1, i32 0, i32 1
+  store i8 undef, ptr %i2, align 1
+  %i3 = getelementptr inbounds %S, ptr %arg, i32 0, i32 2
+  store i8 undef, ptr %i3, align 1
+  %i4 = getelementptr inbounds { i8, i8 }, ptr %i3, i32 0, i32 1
+  store i8 undef, ptr %i4, align 1
+  %i5 = getelementptr inbounds %S, ptr %arg, i32 0, i32 3
+  store i8 undef, ptr %i5, align 1
+  %i6 = getelementptr inbounds { i8, i8 }, ptr %i5, i32 0, i32 1
+  store i8 undef, ptr %i6, align 1
+  %i7 = getelementptr inbounds %S, ptr %arg, i32 0, i32 4
+  store i8 undef, ptr %i7, align 1
+  %i8 = getelementptr inbounds { i8, i8 }, ptr %i7, i32 0, i32 1
+  store i8 undef, ptr %i8, align 1
+  %i9 = getelementptr inbounds %S, ptr %arg, i32 0, i32 5
+  store i8 undef, ptr %i9, align 1
+  %i10 = getelementptr inbounds { i8, i8 }, ptr %i9, i32 0, i32 1
+  store i8 undef, ptr %i10, align 1
+  %i11 = getelementptr inbounds %S, ptr %arg, i32 0, i32 6
+  store i8 undef, ptr %i11, align 1
+  %i12 = getelementptr inbounds { i8, i8 }, ptr %i11, i32 0, i32 1
+  store i8 undef, ptr %i12, align 1
+  %i13 = getelementptr inbounds %S, ptr %arg, i32 0, i32 7
+  store i8 undef, ptr %i13, align 1
+  %i14 = getelementptr inbounds { i8, i8 }, ptr %i13, i32 0, i32 1
+  store i8 0, ptr %i14, align 1
+  ret void
+}
+
+declare void @llvm.memset.p0.i64(ptr, i8, i64, i1)
+