diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2220,17 +2220,17 @@
   }
 
   // Align the pointer arguments to this call if the target thinks it's a good
-  // idea
-  unsigned MinSize;
-  Align PrefAlign;
-  if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
-    for (auto &Arg : CI->args()) {
-      // We want to align both objects whose address is used directly and
-      // objects whose address is used in casts and GEPs, though it only makes
-      // sense for GEPs if the offset is a multiple of the desired alignment and
-      // if size - offset meets the size threshold.
-      if (!Arg->getType()->isPointerTy())
-        continue;
+  // idea (generally only useful for memcpy/memmove/memset).
+  for (auto &Arg : CI->args()) {
+    // We want to align both objects whose address is used directly and
+    // objects whose address is used in casts and GEPs, though it only makes
+    // sense for GEPs if the offset is a multiple of the desired alignment and
+    // if size - offset meets the size threshold.
+    if (!Arg->getType()->isPointerTy())
+      continue;
+    unsigned MinSize;
+    Align PrefAlign;
+    if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
       APInt Offset(DL->getIndexSizeInBits(
                        cast<PointerType>(Arg->getType())->getAddressSpace()),
                    0);
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -19,16 +19,16 @@
 ; GCN-LABEL: test:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    ds_read_u8 v1, v0 offset:1
+; GCN-NEXT:    v_mov_b32_e32 v1, 2
+; GCN-NEXT:    ds_write_b8 v0, v1
 ; GCN-NEXT:    ds_read_u8 v2, v0 offset:2
+; GCN-NEXT:    ds_read_u16 v3, v0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v3, 2
-; GCN-NEXT:    ds_write_b8 v0, v3
-; GCN-NEXT:    ds_write_b8 v0, v3 offset:4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    ds_write_b8 v0, v1 offset:5
 ; GCN-NEXT:    ds_write_b8 v0, v2 offset:6
-; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    ds_write_b16 v0, v3 offset:4
+; GCN-NEXT:    v_cmp_eq_u16_sdwa s[2:3], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
 ; GCN-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ; CHECK-LABEL: @test(
@@ -53,7 +53,6 @@
 ; CHECK-NEXT:    [[FROMBOOL8:%.*]] = zext i1 [[TMP10]] to i8
 ; CHECK-NEXT:    store i8 [[FROMBOOL8]], i8 addrspace(1)* [[PTR_COERCE:%.*]], align 1
 ; CHECK-NEXT:    ret void
-;
 entry:
   store i8 3, i8 addrspace(3)* getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), align 1
   tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i64 3, i1 false)
diff --git a/llvm/test/CodeGen/X86/mcu-abi.ll b/llvm/test/CodeGen/X86/mcu-abi.ll
--- a/llvm/test/CodeGen/X86/mcu-abi.ll
+++ b/llvm/test/CodeGen/X86/mcu-abi.ll
@@ -64,13 +64,14 @@
 define void @ret_large_struct(ptr noalias nocapture sret(%struct.st12_t) %agg.result, ptr byval(%struct.st12_t) nocapture readonly align 4 %r) #0 {
 ; CHECK-LABEL: ret_large_struct:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    movl %eax, %esi
-; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl $48, %ecx
-; CHECK-NEXT:    calll memcpy
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl $12, %ecx
+; CHECK-NEXT:    movl %eax, %edi
+; CHECK-NEXT:    rep;movsl (%esi), %es:(%edi)
 ; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 entry:
   call void @llvm.memcpy.p0.p0.i32(ptr %agg.result, ptr %r, i32 48, i1 false)
diff --git a/llvm/test/CodeGen/X86/memset-2.ll b/llvm/test/CodeGen/X86/memset-2.ll
--- a/llvm/test/CodeGen/X86/memset-2.ll
+++ b/llvm/test/CodeGen/X86/memset-2.ll
@@ -1,31 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=yonah < %s | FileCheck %s
 
-define fastcc void @t1() nounwind {
+define fastcc void @t1(ptr nocapture %s) nounwind {
 ; CHECK-LABEL: t1:
 ; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    subl $16, %esp
 ; CHECK-NEXT:    pushl $188
 ; CHECK-NEXT:    pushl $0
-; CHECK-NEXT:    pushl $0
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _memset
 ; CHECK-NEXT:    addl $16, %esp
 ; CHECK-NEXT:    ud2
 entry:
-  call void @llvm.memset.p0.i32(ptr null, i8 0, i32 188, i1 false)
+  call void @llvm.memset.p0.i32(ptr %s, i8 0, i32 188, i1 false)
   unreachable
 }
 
-define fastcc void @t2(i8 signext %c) nounwind {
+define fastcc void @t2(ptr nocapture %s, i8 signext %c) nounwind {
 ; CHECK-LABEL: t2:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    subl $12, %esp
-; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $76, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    subl $16, %esp
+; CHECK-NEXT:    pushl $76
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _memset
+; CHECK-NEXT:    addl $16, %esp
 ; CHECK-NEXT:    ud2
 entry:
-  call void @llvm.memset.p0.i32(ptr undef, i8 %c, i32 76, i1 false)
+  call void @llvm.memset.p0.i32(ptr %s, i8 %c, i32 76, i1 false)
   unreachable
 }
 
diff --git a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
--- a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -3,55 +3,57 @@
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_64
 
-define void @bork() nounwind {
+define void @bork(ptr nocapture align 4 %dst) nounwind {
 ; FAST-LABEL: bork:
 ; FAST:       # %bb.0:
+; FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FAST-NEXT:    xorps %xmm0, %xmm0
-; FAST-NEXT:    movups %xmm0, 64
-; FAST-NEXT:    movups %xmm0, 48
-; FAST-NEXT:    movups %xmm0, 32
-; FAST-NEXT:    movups %xmm0, 16
-; FAST-NEXT:    movups %xmm0, 0
+; FAST-NEXT:    movups %xmm0, 64(%eax)
+; FAST-NEXT:    movups %xmm0, 48(%eax)
+; FAST-NEXT:    movups %xmm0, 32(%eax)
+; FAST-NEXT:    movups %xmm0, 16(%eax)
+; FAST-NEXT:    movups %xmm0, (%eax)
 ; FAST-NEXT:    retl
 ;
 ; SLOW_32-LABEL: bork:
 ; SLOW_32:       # %bb.0:
-; SLOW_32-NEXT:    movl $0, 4
-; SLOW_32-NEXT:    movl $0, 0
-; SLOW_32-NEXT:    movl $0, 12
-; SLOW_32-NEXT:    movl $0, 8
-; SLOW_32-NEXT:    movl $0, 20
-; SLOW_32-NEXT:    movl $0, 16
-; SLOW_32-NEXT:    movl $0, 28
-; SLOW_32-NEXT:    movl $0, 24
-; SLOW_32-NEXT:    movl $0, 36
-; SLOW_32-NEXT:    movl $0, 32
-; SLOW_32-NEXT:    movl $0, 44
-; SLOW_32-NEXT:    movl $0, 40
-; SLOW_32-NEXT:    movl $0, 52
-; SLOW_32-NEXT:    movl $0, 48
-; SLOW_32-NEXT:    movl $0, 60
-; SLOW_32-NEXT:    movl $0, 56
-; SLOW_32-NEXT:    movl $0, 68
-; SLOW_32-NEXT:    movl $0, 64
-; SLOW_32-NEXT:    movl $0, 76
-; SLOW_32-NEXT:    movl $0, 72
+; SLOW_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SLOW_32-NEXT:    movl $0, 4(%eax)
+; SLOW_32-NEXT:    movl $0, (%eax)
+; SLOW_32-NEXT:    movl $0, 12(%eax)
+; SLOW_32-NEXT:    movl $0, 8(%eax)
+; SLOW_32-NEXT:    movl $0, 20(%eax)
+; SLOW_32-NEXT:    movl $0, 16(%eax)
+; SLOW_32-NEXT:    movl $0, 28(%eax)
+; SLOW_32-NEXT:    movl $0, 24(%eax)
+; SLOW_32-NEXT:    movl $0, 36(%eax)
+; SLOW_32-NEXT:    movl $0, 32(%eax)
+; SLOW_32-NEXT:    movl $0, 44(%eax)
+; SLOW_32-NEXT:    movl $0, 40(%eax)
+; SLOW_32-NEXT:    movl $0, 52(%eax)
+; SLOW_32-NEXT:    movl $0, 48(%eax)
+; SLOW_32-NEXT:    movl $0, 60(%eax)
+; SLOW_32-NEXT:    movl $0, 56(%eax)
+; SLOW_32-NEXT:    movl $0, 68(%eax)
+; SLOW_32-NEXT:    movl $0, 64(%eax)
+; SLOW_32-NEXT:    movl $0, 76(%eax)
+; SLOW_32-NEXT:    movl $0, 72(%eax)
 ; SLOW_32-NEXT:    retl
 ;
 ; SLOW_64-LABEL: bork:
 ; SLOW_64:       # %bb.0:
-; SLOW_64-NEXT:    movq $0, 72
-; SLOW_64-NEXT:    movq $0, 64
-; SLOW_64-NEXT:    movq $0, 56
-; SLOW_64-NEXT:    movq $0, 48
-; SLOW_64-NEXT:    movq $0, 40
-; SLOW_64-NEXT:    movq $0, 32
-; SLOW_64-NEXT:    movq $0, 24
-; SLOW_64-NEXT:    movq $0, 16
-; SLOW_64-NEXT:    movq $0, 8
-; SLOW_64-NEXT:    movq $0, 0
+; SLOW_64-NEXT:    movq $0, 72(%rdi)
+; SLOW_64-NEXT:    movq $0, 64(%rdi)
+; SLOW_64-NEXT:    movq $0, 56(%rdi)
+; SLOW_64-NEXT:    movq $0, 48(%rdi)
+; SLOW_64-NEXT:    movq $0, 40(%rdi)
+; SLOW_64-NEXT:    movq $0, 32(%rdi)
+; SLOW_64-NEXT:    movq $0, 24(%rdi)
+; SLOW_64-NEXT:    movq $0, 16(%rdi)
+; SLOW_64-NEXT:    movq $0, 8(%rdi)
+; SLOW_64-NEXT:    movq $0, (%rdi)
 ; SLOW_64-NEXT:    retq
-  call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 80, i1 false)
+  call void @llvm.memset.p0.i64(ptr align 4 %dst, i8 0, i64 80, i1 false)
   ret void
 }
 
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll b/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll
--- a/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes
 ; RUN: opt -S -mtriple=x86_64 -disable-simplify-libcalls -codegenprepare < %s | FileCheck %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -10,8 +10,9 @@
 ; - TLI::has (always returns false thanks to -disable-simplify-libcalls)
 
 define void @test_nobuiltin(i8* %dst, i64 %len) {
-; CHECK-LABEL: @test_nobuiltin(
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST:%.*]], i8 0, i64 [[LEN:%.*]], i1 false) #1
+; CHECK-LABEL: define {{[^@]+}}@test_nobuiltin
+; CHECK-SAME: (i8* [[DST:%.*]], i64 [[LEN:%.*]]) {
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 1 [[DST]], i8 0, i64 [[LEN]], i1 false)
 ; CHECK-NEXT:    ret void
 ;
   call i8* @__memset_chk(i8* %dst, i32 0, i64 %len, i64 -1) nobuiltin