Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -136,6 +136,14 @@
   if (Size > 8 || (Size&(Size-1)))
     return nullptr;  // If not 1/2/4/8 bytes, exit.
 
+  // If this is an atomic memcpy/memmove and the alignment is less than the
+  // size, we would introduce an unaligned memory access that CodeGen later
+  // turns into a libcall. That is not an evident performance gain, so
+  // disable the transform for now.
+  if (isa<AtomicMemTransferInst>(MI))
+    if (CopyDstAlign < Size || CopySrcAlign < Size)
+      return nullptr;
+
   // Use an integer load+store unless we can find something better.
   unsigned SrcAddrSp =
     cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
@@ -220,6 +228,18 @@
   Alignment = MI->getDestAlignment();
   assert(Len && "0-sized memory setting should be removed already.");
 
+  // Alignment 0 is identity for alignment 1 for memset, but not store.
+  if (Alignment == 0)
+    Alignment = 1;
+
+  // If this is an atomic memset and the alignment is less than the size, we
+  // would introduce an unaligned memory access that CodeGen later turns into
+  // a libcall. That is not an evident performance gain, so disable the
+  // transform for now.
+  if (isa<AtomicMemSetInst>(MI))
+    if (Alignment < Len)
+      return nullptr;
+
   // memset(s,c,n) -> store s, c (for n=1,2,4,8)
   if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
     Type *ITy = IntegerType::get(MI->getContext(), Len*8);  // n=1 -> i8.
@@ -229,9 +249,6 @@
     Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
     Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);
 
-    // Alignment 0 is identity for alignment 1 for memset, but not store.
-    if (Alignment == 0) Alignment = 1;
-
     // Extract the fill value and store.
     uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
     StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
Index: test/Transforms/InstCombine/element-atomic-memintrins.ll
===================================================================
--- test/Transforms/InstCombine/element-atomic-memintrins.ll
+++ test/Transforms/InstCombine/element-atomic-memintrins.ll
@@ -15,12 +15,9 @@
 define void @test_memset_to_store(i8* %dest) {
 ; CHECK-LABEL: @test_memset_to_store(
 ; CHECK-NEXT: store atomic i8 1, i8* [[DEST:%.*]] unordered, align 1
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[DEST]] to i16*
-; CHECK-NEXT: store atomic i16 257, i16* [[TMP1]] unordered, align 1
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST]] to i32*
-; CHECK-NEXT: store atomic i32 16843009, i32* [[TMP2]] unordered, align 1
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[DEST]] to i64*
-; CHECK-NEXT: store atomic i64 72340172838076673, i64* [[TMP3]] unordered, align 1
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 2, i32 1)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 4, i32 1)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 8, i32 1)
 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 16, i32 1)
 ; CHECK-NEXT: ret void
 ;
@@ -36,10 +33,8 @@
 ; CHECK-LABEL: @test_memset_to_store_2(
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[DEST:%.*]] to i16*
 ; CHECK-NEXT: store atomic i16 257, i16* [[TMP1]] unordered, align 2
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST]] to i32*
-; CHECK-NEXT: store atomic i32 16843009, i32* [[TMP2]] unordered, align 2
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[DEST]] to i64*
-; CHECK-NEXT: store atomic i64 72340172838076673, i64* [[TMP3]] unordered, align 2
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 2 [[DEST]], i8 1, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 2 [[DEST]], i8 1, i32 8, i32 2)
 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 2 [[DEST]], i8 1, i32 16, i32 2)
 ; CHECK-NEXT: ret void
 ;
@@ -54,8 +49,7 @@
 ; CHECK-LABEL: @test_memset_to_store_4(
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[DEST:%.*]] to i32*
 ; CHECK-NEXT: store atomic i32 16843009, i32* [[TMP1]] unordered, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST]] to i64*
-; CHECK-NEXT: store atomic i64 72340172838076673, i64* [[TMP2]] unordered, align 4
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 [[DEST]], i8 1, i32 8, i32 4)
 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 [[DEST]], i8 1, i32 16, i32 4)
 ; CHECK-NEXT: ret void
 ;
@@ -134,18 +128,9 @@
 ; CHECK-LABEL: @test_memmove_loadstore(
 ; CHECK-NEXT: [[TMP1:%.*]] = load atomic i8, i8* [[SRC:%.*]] unordered, align 1
 ; CHECK-NEXT: store atomic i8 [[TMP1]], i8* [[DEST:%.*]] unordered, align 1
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to i16*
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[DEST]] to i16*
-; CHECK-NEXT: [[TMP4:%.*]] = load atomic i16, i16* [[TMP2]] unordered, align 1
-; CHECK-NEXT: store atomic i16 [[TMP4]], i16* [[TMP3]] unordered, align 1
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[SRC]] to i32*
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[DEST]] to i32*
-; CHECK-NEXT: [[TMP7:%.*]] = load atomic i32, i32* [[TMP5]] unordered, align 1
-; CHECK-NEXT: store atomic i32 [[TMP7]], i32* [[TMP6]] unordered, align 1
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[SRC]] to i64*
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
-; CHECK-NEXT: [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
-; CHECK-NEXT: store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 2, i32 1)
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 4, i32 1)
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 8, i32 1)
 ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
 ; CHECK-NEXT: ret void
 ;
@@ -163,14 +148,8 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i16*
 ; CHECK-NEXT: [[TMP3:%.*]] = load atomic i16, i16* [[TMP1]] unordered, align 2
 ; CHECK-NEXT: store atomic i16 [[TMP3]], i16* [[TMP2]] unordered, align 2
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i32*
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i32*
-; CHECK-NEXT: [[TMP6:%.*]] = load atomic i32, i32* [[TMP4]] unordered, align 2
-; CHECK-NEXT: store atomic i32 [[TMP6]], i32* [[TMP5]] unordered, align 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[SRC]] to i64*
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[DEST]] to i64*
-; CHECK-NEXT: [[TMP9:%.*]] = load atomic i64, i64* [[TMP7]] unordered, align 2
-; CHECK-NEXT: store atomic i64 [[TMP9]], i64* [[TMP8]] unordered, align 2
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 8, i32 2)
 ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 16, i32 2)
 ; CHECK-NEXT: ret void
 ;
@@ -187,10 +166,7 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i32*
 ; CHECK-NEXT: [[TMP3:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
 ; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[TMP2]] unordered, align 4
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i64*
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i64*
-; CHECK-NEXT: [[TMP6:%.*]] = load atomic i64, i64* [[TMP4]] unordered, align 4
-; CHECK-NEXT: store atomic i64 [[TMP6]], i64* [[TMP5]] unordered, align 4
+; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 8, i32 4)
 ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 16, i32 4)
 ; CHECK-NEXT: ret void
 ;
@@ -258,18 +234,9 @@
 ; CHECK-LABEL: @test_memcpy_loadstore(
 ; CHECK-NEXT: [[TMP1:%.*]] = load atomic i8, i8* [[SRC:%.*]] unordered, align 1
 ; CHECK-NEXT: store atomic i8 [[TMP1]], i8* [[DEST:%.*]] unordered, align 1
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to i16*
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[DEST]] to i16*
-; CHECK-NEXT: [[TMP4:%.*]] = load atomic i16, i16* [[TMP2]] unordered, align 1
-; CHECK-NEXT: store atomic i16 [[TMP4]], i16* [[TMP3]] unordered, align 1
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[SRC]] to i32*
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[DEST]] to i32*
-; CHECK-NEXT: [[TMP7:%.*]] = load atomic i32, i32* [[TMP5]] unordered, align 1
-; CHECK-NEXT: store atomic i32 [[TMP7]], i32* [[TMP6]] unordered, align 1
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[SRC]] to i64*
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
-; CHECK-NEXT: [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
-; CHECK-NEXT: store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 2, i32 1)
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 4, i32 1)
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 8, i32 1)
 ; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
 ; CHECK-NEXT: ret void
 ;
@@ -287,14 +254,8 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i16*
 ; CHECK-NEXT: [[TMP3:%.*]] = load atomic i16, i16* [[TMP1]] unordered, align 2
 ; CHECK-NEXT: store atomic i16 [[TMP3]], i16* [[TMP2]] unordered, align 2
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i32*
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i32*
-; CHECK-NEXT: [[TMP6:%.*]] = load atomic i32, i32* [[TMP4]] unordered, align 2
-; CHECK-NEXT: store atomic i32 [[TMP6]], i32* [[TMP5]] unordered, align 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[SRC]] to i64*
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[DEST]] to i64*
-; CHECK-NEXT: [[TMP9:%.*]] = load atomic i64, i64* [[TMP7]] unordered, align 2
-; CHECK-NEXT: store atomic i64 [[TMP9]], i64* [[TMP8]] unordered, align 2
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 8, i32 2)
 ; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 16, i32 2)
 ; CHECK-NEXT: ret void
 ;
@@ -311,10 +272,7 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i32*
 ; CHECK-NEXT: [[TMP3:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
 ; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[TMP2]] unordered, align 4
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i64*
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i64*
-; CHECK-NEXT: [[TMP6:%.*]] = load atomic i64, i64* [[TMP4]] unordered, align 4
-; CHECK-NEXT: store atomic i64 [[TMP6]], i64* [[TMP5]] unordered, align 4
+; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 8, i32 4)
 ; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 16, i32 4)
 ; CHECK-NEXT: ret void
 ;
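
Illustration (not part of the patch): a minimal IR sketch of the new behaviour. The function name @example and its operands are hypothetical; the intrinsic signature matches the declarations already used in element-atomic-memintrins.ll. With this change, InstCombine is expected to leave the first call below untouched because its alignment (1) is smaller than the constant length (8), while the second call, whose alignment covers the whole length, can still be folded into a single unordered atomic store.

define void @example(i8* %dest) {
  ; alignment (1) < length (8): kept as an intrinsic call after this patch
  call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1)
  ; alignment (8) >= length (8): expected to still fold to
  ; 'store atomic i64 72340172838076673, i64* ... unordered, align 8'
  call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 8 %dest, i8 1, i32 8, i32 1)
  ret void
}

declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8*, i8, i32, i32)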