Index: lib/Transforms/Scalar/DeadStoreElimination.cpp =================================================================== --- lib/Transforms/Scalar/DeadStoreElimination.cpp +++ lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -278,9 +278,10 @@ default: return false; case Intrinsic::memset: case Intrinsic::memcpy: + case Intrinsic::memcpy_element_unordered_atomic: + case Intrinsic::memset_element_unordered_atomic: // Do shorten memory intrinsics. // FIXME: Add memmove if it's also safe to transform. - // TODO: Add atomic memcpy/memset return true; } } @@ -295,9 +296,7 @@ static bool isShortenableAtTheBeginning(Instruction *I) { // FIXME: Handle only memset for now. Supporting memcpy/memmove should be // easily done by offsetting the source address. - // TODO: Handle atomic memory intrinsics - IntrinsicInst *II = dyn_cast(I); - return II && II->getIntrinsicID() == Intrinsic::memset; + return isa(I); } /// Return the pointer that is being written to. @@ -897,7 +896,7 @@ // Power of 2 vector writes are probably always a bad idea to optimize // as any store/memset/memcpy is likely using vector instructions so // shortening it to not vector size is likely to be slower - MemIntrinsic *EarlierIntrinsic = cast(EarlierWrite); + auto *EarlierIntrinsic = cast(EarlierWrite); unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment(); if (!IsOverwriteEnd) LaterOffset = int64_t(LaterOffset + LaterSize); @@ -906,15 +905,23 @@ !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0)) return false; + int64_t NewLength = IsOverwriteEnd + ? LaterOffset - EarlierOffset + : EarlierSize - (LaterOffset - EarlierOffset); + + if (auto *AMI = dyn_cast(EarlierWrite)) { + // When shortening an atomic memory intrinsic, the newly shortened + // length must remain an integer multiple of the element size. + const uint32_t ElementSize = AMI->getElementSizeInBytes(); + if (0 != NewLength % ElementSize) + return false; + } + DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW " << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite << "\n KILLER (offset " << LaterOffset << ", " << EarlierSize << ")\n"); - int64_t NewLength = IsOverwriteEnd - ? LaterOffset - EarlierOffset - : EarlierSize - (LaterOffset - EarlierOffset); - Value *EarlierWriteLength = EarlierIntrinsic->getLength(); Value *TrimmedLength = ConstantInt::get(EarlierWriteLength->getType(), NewLength); Index: test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll =================================================================== --- test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll +++ test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll @@ -26,7 +26,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4) ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 ; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4 ; CHECK-NEXT: ret void @@ -60,7 +61,8 @@ ; CHECK-LABEL: @write0to3_atomic( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4) ; CHECK-NEXT: store atomic i32 1, i32* [[P]] unordered, align 4 ; CHECK-NEXT: ret void ; @@ -76,7 +78,8 @@ ; CHECK-LABEL: @write0to3_atomic_weaker( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4) ; CHECK-NEXT: store i32 1, i32* [[P]], align 4 ; CHECK-NEXT: ret void ; @@ -111,7 +114,8 @@ ; CHECK-LABEL: @write0to7_atomic( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 8 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4) ; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i64* ; CHECK-NEXT: store atomic i64 1, i64* [[P4]] unordered, align 8 ; CHECK-NEXT: ret void @@ -149,7 +153,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4) ; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i64* ; CHECK-NEXT: store atomic i64 1, i64* [[P4]] unordered, align 8 ; CHECK-NEXT: ret void @@ -307,7 +312,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 -; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16 +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8) ; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0 ; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1 ; CHECK-NEXT: store atomic i64 1, i64* [[BASE64_1]] unordered, align 8 @@ -333,7 +339,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 -; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16 +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8) ; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0 ; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1 ; CHECK-NEXT: store atomic i64 1, i64* [[BASE64_1]] unordered, align 8 @@ -359,7 +366,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 -; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16 +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8) ; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0 ; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1 ; CHECK-NEXT: store i64 1, i64* [[BASE64_1]], align 8 Index: test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll =================================================================== --- test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll +++ test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll @@ -32,7 +32,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4) +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 24, i32 4) ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4 ; CHECK-NEXT: ret void @@ -52,7 +52,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4) +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 24, i32 4) ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 ; CHECK-NEXT: ret void @@ -87,7 +87,7 @@ ; CHECK-LABEL: @write28to32_atomic( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8* -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4) +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4) ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4 ; CHECK-NEXT: ret void @@ -155,7 +155,7 @@ ; CHECK-LABEL: @write32to36_atomic( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2plusi* [[P:%.*]] to i8* -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 36, i32 4) +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 32, i32 4) ; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2PLUSI:%.*]], %struct.vec2plusi* [[P]], i64 0, i32 2 ; CHECK-NEXT: store atomic i32 1, i32* [[C]] unordered, align 4 ; CHECK-NEXT: ret void @@ -173,7 +173,7 @@ ; CHECK-LABEL: @write32to36_atomic_weaker( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2plusi* [[P:%.*]] to i8* -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 36, i32 4) +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 32, i32 4) ; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2PLUSI:%.*]], %struct.vec2plusi* [[P]], i64 0, i32 2 ; CHECK-NEXT: store i32 1, i32* [[C]], align 4 ; CHECK-NEXT: ret void @@ -207,7 +207,7 @@ ; CHECK-LABEL: @write16to32_atomic( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2* [[P:%.*]] to i8* -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 32, i32 4) +; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 16, i32 4) ; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], %struct.vec2* [[P]], i64 0, i32 1 ; CHECK-NEXT: store <4 x i32> , <4 x i32>* [[C]], align 4 ; CHECK-NEXT: ret void @@ -316,7 +316,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 -; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8) +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8) ; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2 ; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3 ; CHECK-NEXT: store atomic i64 3, i64* [[BASE64_2]] unordered, align 8 @@ -342,7 +342,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 -; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8) +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8) ; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2 ; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3 ; CHECK-NEXT: store i64 3, i64* [[BASE64_2]], align 8 @@ -368,7 +368,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8* ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0 -; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8) +; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8) ; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2 ; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3 ; CHECK-NEXT: store atomic i64 3, i64* [[BASE64_2]] unordered, align 8