diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2220,10 +2220,7 @@ } // Align the pointer arguments to this call if the target thinks it's a good - // idea - unsigned MinSize; - Align PrefAlign; - if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { + // idea (generally only useful for memcpy/memmove/memset). for (auto &Arg : CI->args()) { // We want to align both objects whose address is used directly and // objects whose address is used in casts and GEPs, though it only makes @@ -2231,6 +2228,9 @@ // if size - offset meets the size threshold. if (!Arg->getType()->isPointerTy()) continue; + unsigned MinSize; + Align PrefAlign; + if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { APInt Offset(DL->getIndexSizeInBits( cast<PointerType>(Arg->getType())->getAddressSpace()), 0); diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --force-update ; RUN: llc -march=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s @@ -20,16 +19,16 @@ ; GCN-LABEL: test: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: ds_read_u8 v1, v0 offset:1 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: ds_write_b8 v0, v1 ; GCN-NEXT: ds_read_u8 v2, v0 offset:2 +; GCN-NEXT: ds_read_u16 v3, v0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v3, 2 -; GCN-NEXT: ds_write_b8 v0, v3 -; GCN-NEXT: ds_write_b8 v0, v3 offset:4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN-NEXT: ds_write_b8 v0, v1 offset:5 ; GCN-NEXT: ds_write_b8 v0, v2 offset:6 -; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: ds_write_b16 v0, v3 offset:4 +; GCN-NEXT: v_cmp_eq_u16_sdwa s[2:3], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; GCN-NEXT: global_store_byte v0, v1, s[0:1] ; GCN-NEXT: s_endpgm ; CHECK-LABEL: @test( @@ -54,7 +53,6 @@ ; CHECK-NEXT: [[FROMBOOL8:%.*]] = zext i1 [[TMP10]] to i8 ; CHECK-NEXT: store i8 [[FROMBOOL8]], i8 addrspace(1)* [[PTR_COERCE:%.*]], align 1 ; CHECK-NEXT: ret void -; entry: store i8 3, i8 addrspace(3)* getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), align 1 tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i64 3, i1 false) diff --git a/llvm/test/CodeGen/X86/mcu-abi.ll b/llvm/test/CodeGen/X86/mcu-abi.ll --- a/llvm/test/CodeGen/X86/mcu-abi.ll +++ b/llvm/test/CodeGen/X86/mcu-abi.ll @@ -64,13 +64,14 @@ define void @ret_large_struct(ptr noalias nocapture sret(%struct.st12_t) %agg.result, ptr byval(%struct.st12_t) nocapture readonly align 4 %r) #0 { ; CHECK-LABEL: ret_large_struct: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: movl %eax, %esi -; CHECK-NEXT: leal {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl $48, %ecx -; CHECK-NEXT: calll memcpy -; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl $12, %ecx +; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: rep;movsl (%esi), %es:(%edi) ; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi ; CHECK-NEXT: retl entry: call void @llvm.memcpy.p0.p0.i32(ptr %agg.result, ptr %r, i32 48, i1 false) diff --git 
a/llvm/test/CodeGen/X86/memset-2.ll b/llvm/test/CodeGen/X86/memset-2.ll --- a/llvm/test/CodeGen/X86/memset-2.ll +++ b/llvm/test/CodeGen/X86/memset-2.ll @@ -4,12 +4,21 @@ define fastcc void @t1() nounwind { ; CHECK-LABEL: t1: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: subl $16, %esp -; CHECK-NEXT: pushl $188 -; CHECK-NEXT: pushl $0 -; CHECK-NEXT: pushl $0 -; CHECK-NEXT: calll _memset -; CHECK-NEXT: addl $16, %esp +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, 160 +; CHECK-NEXT: movaps %xmm0, 144 +; CHECK-NEXT: movaps %xmm0, 128 +; CHECK-NEXT: movaps %xmm0, 112 +; CHECK-NEXT: movaps %xmm0, 96 +; CHECK-NEXT: movaps %xmm0, 80 +; CHECK-NEXT: movaps %xmm0, 64 +; CHECK-NEXT: movaps %xmm0, 48 +; CHECK-NEXT: movaps %xmm0, 32 +; CHECK-NEXT: movaps %xmm0, 16 +; CHECK-NEXT: movaps %xmm0, 0 +; CHECK-NEXT: movl $0, 180 +; CHECK-NEXT: movl $0, 176 +; CHECK-NEXT: movl $0, 184 ; CHECK-NEXT: ud2 entry: call void @llvm.memset.p0.i32(ptr null, i8 0, i32 188, i1 false) diff --git a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll --- a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll +++ b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll @@ -7,49 +7,31 @@ ; FAST-LABEL: bork: ; FAST: # %bb.0: ; FAST-NEXT: xorps %xmm0, %xmm0 -; FAST-NEXT: movups %xmm0, 64 -; FAST-NEXT: movups %xmm0, 48 -; FAST-NEXT: movups %xmm0, 32 -; FAST-NEXT: movups %xmm0, 16 -; FAST-NEXT: movups %xmm0, 0 +; FAST-NEXT: movaps %xmm0, 64 +; FAST-NEXT: movaps %xmm0, 48 +; FAST-NEXT: movaps %xmm0, 32 +; FAST-NEXT: movaps %xmm0, 16 +; FAST-NEXT: movaps %xmm0, 0 ; FAST-NEXT: retl ; ; SLOW_32-LABEL: bork: ; SLOW_32: # %bb.0: -; SLOW_32-NEXT: movl $0, 4 -; SLOW_32-NEXT: movl $0, 0 -; SLOW_32-NEXT: movl $0, 12 -; SLOW_32-NEXT: movl $0, 8 -; SLOW_32-NEXT: movl $0, 20 -; SLOW_32-NEXT: movl $0, 16 -; SLOW_32-NEXT: movl $0, 28 -; SLOW_32-NEXT: movl $0, 24 -; SLOW_32-NEXT: movl $0, 36 -; SLOW_32-NEXT: movl $0, 32 -; SLOW_32-NEXT: movl $0, 44 -; SLOW_32-NEXT: movl $0, 40 -; 
SLOW_32-NEXT: movl $0, 52 -; SLOW_32-NEXT: movl $0, 48 -; SLOW_32-NEXT: movl $0, 60 -; SLOW_32-NEXT: movl $0, 56 -; SLOW_32-NEXT: movl $0, 68 -; SLOW_32-NEXT: movl $0, 64 -; SLOW_32-NEXT: movl $0, 76 -; SLOW_32-NEXT: movl $0, 72 +; SLOW_32-NEXT: xorps %xmm0, %xmm0 +; SLOW_32-NEXT: movaps %xmm0, 64 +; SLOW_32-NEXT: movaps %xmm0, 48 +; SLOW_32-NEXT: movaps %xmm0, 32 +; SLOW_32-NEXT: movaps %xmm0, 16 +; SLOW_32-NEXT: movaps %xmm0, 0 ; SLOW_32-NEXT: retl ; ; SLOW_64-LABEL: bork: ; SLOW_64: # %bb.0: -; SLOW_64-NEXT: movq $0, 72 -; SLOW_64-NEXT: movq $0, 64 -; SLOW_64-NEXT: movq $0, 56 -; SLOW_64-NEXT: movq $0, 48 -; SLOW_64-NEXT: movq $0, 40 -; SLOW_64-NEXT: movq $0, 32 -; SLOW_64-NEXT: movq $0, 24 -; SLOW_64-NEXT: movq $0, 16 -; SLOW_64-NEXT: movq $0, 8 -; SLOW_64-NEXT: movq $0, 0 +; SLOW_64-NEXT: xorps %xmm0, %xmm0 +; SLOW_64-NEXT: movaps %xmm0, 64 +; SLOW_64-NEXT: movaps %xmm0, 48 +; SLOW_64-NEXT: movaps %xmm0, 32 +; SLOW_64-NEXT: movaps %xmm0, 16 +; SLOW_64-NEXT: movups %xmm0, 0 ; SLOW_64-NEXT: retq call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 80, i1 false) ret void diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll b/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll --- a/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/memset_chk-simplify-nobuiltin.ll @@ -11,7 +11,7 @@ define void @test_nobuiltin(i8* %dst, i64 %len) { ; CHECK-LABEL: @test_nobuiltin( -; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[DST:%.*]], i8 0, i64 [[LEN:%.*]], i1 false) #1 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[DST:%.*]], i8 0, i64 [[LEN:%.*]], i1 false) #[[ATTR1:[0-9]+]] ; CHECK-NEXT: ret void ; call i8* @__memset_chk(i8* %dst, i32 0, i64 %len, i64 -1) nobuiltin