Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -2010,8 +2010,7 @@ bool MemcpyStrSrc, MachineFunction &MF) const { const Function *F = MF.getFunction(); - if ((!IsMemset || ZeroMemset) && - !F->hasFnAttribute(Attribute::NoImplicitFloat)) { + if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && Index: test/CodeGen/X86/mem-intrin-base-reg.ll =================================================================== --- test/CodeGen/X86/mem-intrin-base-reg.ll +++ test/CodeGen/X86/mem-intrin-base-reg.ll @@ -91,10 +91,39 @@ ; CHECK-LABEL: _memset_vla_vector: ; CHECK: andl $-16, %esp ; CHECK: movl %esp, %esi -; CHECK-DAG: movl $707406378, %eax # imm = 0x2A2A2A2A -; CHECK-DAG: movl $32, %ecx -; CHECK-DAG: movl {{.*}}, %edi ; CHECK-NOT: movl {{.*}}, %esi -; CHECK: rep;stosl +; CHECK-DAG: movl 12(%ebp), %ecx +; CHECK-DAG: movl $707406378, 4(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, (%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 12(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 8(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 20(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 16(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 28(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 24(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 36(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 32(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 44(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 40(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 52(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 48(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 60(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 56(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 68(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 64(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 76(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 72(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 84(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 80(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 92(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 88(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 100(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 96(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 108(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 104(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 116(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 112(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 124(%ecx) # imm = 0x2A2A2A2A +; CHECK-DAG: movl $707406378, 120(%ecx) # imm = 0x2A2A2A2A ; Add a test for memcmp if we ever add a special lowering for it. Index: test/CodeGen/X86/memset-2.ll =================================================================== --- test/CodeGen/X86/memset-2.ll +++ test/CodeGen/X86/memset-2.ll @@ -18,10 +18,16 @@ define fastcc void @t2(i8 signext %c) nounwind { ; CHECK-LABEL: t2: -; CHECK: subl $12, %esp -; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $76, {{[0-9]+}}(%esp) -; CHECK-NEXT: calll L_memset$stub +; CHECK: movzbl %cl, %eax +; CHECK-NEXT: imull $16843009, %eax, %ecx ## imm = 0x1010101 +; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movl $16843009, %edx ## imm = 0x1010101 +; CHECK-NEXT: mull %edx +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movq %xmm0, (%eax) ; entry: call void @llvm.memset.p0i8.i32(i8* undef, i8 %c, i32 76, i32 1, i1 false) @@ -32,11 +38,19 @@ define void @t3(i8* nocapture %s, i8 %a) nounwind { ; CHECK-LABEL: t3: -; CHECK: movl {{[0-9]+}}(%esp), %eax +; CHECK: pushl %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101 -; CHECK-NEXT: movl %ecx, 4(%eax) -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movl $16843009, %edx ## imm = 0x1010101 +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: mull %edx +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movq %xmm0, (%esi) +; CHECK-NEXT: popl %esi ; CHECK-NEXT: retl ; entry: @@ -46,14 +60,20 @@ define void @t4(i8* nocapture %s, i8 %a) nounwind { ; CHECK-LABEL: t4: -; CHECK: movl {{[0-9]+}}(%esp), %eax +; CHECK: pushl %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101 -; CHECK-NEXT: movl %ecx, 8(%eax) -; CHECK-NEXT: movl %ecx, 4(%eax) -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: movw %cx, 12(%eax) -; CHECK-NEXT: movb %cl, 14(%eax) +; CHECK-NEXT: movl $16843009, %edx ## imm = 0x1010101 +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: mull %edx +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movq %xmm0, 7(%esi) +; CHECK-NEXT: movq %xmm0, (%esi) +; CHECK-NEXT: popl %esi ; CHECK-NEXT: retl ; entry: Index: test/CodeGen/X86/memset-nonzero.ll =================================================================== --- test/CodeGen/X86/memset-nonzero.ll +++ test/CodeGen/X86/memset-nonzero.ll @@ -4,81 +4,161 @@ ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=AVX --check-prefix=AVX2 define void @memset_16_nonzero_bytes(i8* %x) { -; ANY-LABEL: memset_16_nonzero_bytes: -; ANY: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A -; ANY-NEXT: movq %rax, 8(%rdi) -; ANY-NEXT: movq %rax, (%rdi) -; ANY-NEXT: retq +; SSE2-LABEL: memset_16_nonzero_bytes: +; SSE2: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A +; SSE2-NEXT: movq %rax, 8(%rdi) +; SSE2-NEXT: movq %rax, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: memset_16_nonzero_bytes: +; AVX1: vmovaps {{.*#+}} xmm0 = [707406378,707406378,707406378,707406378] +; AVX1-NEXT: vmovups %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_16_nonzero_bytes: +; AVX2: vbroadcastss {{.*}}(%rip), %xmm0 +; AVX2-NEXT: vmovups %xmm0, (%rdi) +; AVX2-NEXT: retq ; %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1) ret void } define void @memset_32_nonzero_bytes(i8* %x) { -; ANY-LABEL: memset_32_nonzero_bytes: -; ANY: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A -; ANY-NEXT: movq %rax, 24(%rdi) -; ANY-NEXT: movq %rax, 16(%rdi) -; ANY-NEXT: movq %rax, 8(%rdi) -; ANY-NEXT: movq %rax, (%rdi) -; ANY-NEXT: retq +; SSE2-LABEL: memset_32_nonzero_bytes: +; SSE2: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A +; SSE2-NEXT: movq %rax, 24(%rdi) +; SSE2-NEXT: movq %rax, 16(%rdi) +; SSE2-NEXT: movq %rax, 8(%rdi) +; SSE2-NEXT: movq %rax, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: memset_32_nonzero_bytes: +; AVX1: vmovaps {{.*#+}} ymm0 = [1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13] +; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_32_nonzero_bytes: +; AVX2: vbroadcastss {{.*}}(%rip), %ymm0 +; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1) ret void } define void @memset_64_nonzero_bytes(i8* %x) { -; ANY-LABEL: memset_64_nonzero_bytes: -; ANY: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A -; ANY-NEXT: movq %rax, 56(%rdi) -; ANY-NEXT: movq %rax, 48(%rdi) -; ANY-NEXT: movq %rax, 40(%rdi) -; ANY-NEXT: movq %rax, 32(%rdi) -; ANY-NEXT: movq %rax, 24(%rdi) -; ANY-NEXT: movq %rax, 16(%rdi) -; ANY-NEXT: movq %rax, 8(%rdi) -; ANY-NEXT: movq %rax, (%rdi) -; ANY-NEXT: retq +; SSE2-LABEL: memset_64_nonzero_bytes: +; SSE2: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A +; SSE2-NEXT: movq %rax, 56(%rdi) +; SSE2-NEXT: movq %rax, 48(%rdi) +; SSE2-NEXT: movq %rax, 40(%rdi) +; SSE2-NEXT: movq %rax, 32(%rdi) +; SSE2-NEXT: movq %rax, 24(%rdi) +; SSE2-NEXT: movq %rax, 16(%rdi) +; SSE2-NEXT: movq %rax, 8(%rdi) +; SSE2-NEXT: movq %rax, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: memset_64_nonzero_bytes: +; AVX1: vmovaps {{.*#+}} ymm0 = [1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13] +; AVX1-NEXT: vmovups %ymm0, 32(%rdi) +; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_64_nonzero_bytes: +; AVX2: vbroadcastss {{.*}}(%rip), %ymm0 +; AVX2-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1) ret void } define void @memset_128_nonzero_bytes(i8* %x) { -; ANY-LABEL: memset_128_nonzero_bytes: -; ANY: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A -; ANY-NEXT: movq %rax, 120(%rdi) -; ANY-NEXT: movq %rax, 112(%rdi) -; ANY-NEXT: movq %rax, 104(%rdi) -; ANY-NEXT: movq %rax, 96(%rdi) -; ANY-NEXT: movq %rax, 88(%rdi) -; ANY-NEXT: movq %rax, 80(%rdi) -; ANY-NEXT: movq %rax, 72(%rdi) -; ANY-NEXT: movq %rax, 64(%rdi) -; ANY-NEXT: movq %rax, 56(%rdi) -; ANY-NEXT: movq %rax, 48(%rdi) -; ANY-NEXT: movq %rax, 40(%rdi) -; ANY-NEXT: movq %rax, 32(%rdi) -; ANY-NEXT: movq %rax, 24(%rdi) -; ANY-NEXT: movq %rax, 16(%rdi) -; ANY-NEXT: movq %rax, 8(%rdi) -; ANY-NEXT: movq %rax, (%rdi) -; ANY-NEXT: retq +; SSE2-LABEL: memset_128_nonzero_bytes: +; SSE2: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A +; SSE2-NEXT: movq %rax, 120(%rdi) +; SSE2-NEXT: movq %rax, 112(%rdi) +; SSE2-NEXT: movq %rax, 104(%rdi) +; SSE2-NEXT: movq %rax, 96(%rdi) +; SSE2-NEXT: movq %rax, 88(%rdi) +; SSE2-NEXT: movq %rax, 80(%rdi) +; SSE2-NEXT: movq %rax, 72(%rdi) +; SSE2-NEXT: movq %rax, 64(%rdi) +; SSE2-NEXT: movq %rax, 56(%rdi) +; SSE2-NEXT: movq %rax, 48(%rdi) +; SSE2-NEXT: movq %rax, 40(%rdi) +; SSE2-NEXT: movq %rax, 32(%rdi) +; SSE2-NEXT: movq %rax, 24(%rdi) +; SSE2-NEXT: movq %rax, 16(%rdi) +; SSE2-NEXT: movq %rax, 8(%rdi) +; SSE2-NEXT: movq %rax, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: memset_128_nonzero_bytes: +; AVX1: vmovaps {{.*#+}} ymm0 = [1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13] +; AVX1-NEXT: vmovups %ymm0, 96(%rdi) +; AVX1-NEXT: vmovups %ymm0, 64(%rdi) +; AVX1-NEXT: vmovups %ymm0, 32(%rdi) +; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_128_nonzero_bytes: +; AVX2: vbroadcastss {{.*}}(%rip), %ymm0 +; AVX2-NEXT: vmovups %ymm0, 96(%rdi) +; AVX2-NEXT: vmovups %ymm0, 64(%rdi) +; AVX2-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1) ret void } define void @memset_256_nonzero_bytes(i8* %x) { -; ANY-LABEL: memset_256_nonzero_bytes: -; ANY: pushq %rax -; ANY-NEXT: .Ltmp0: -; ANY-NEXT: .cfi_def_cfa_offset 16 -; ANY-NEXT: movl $42, %esi -; ANY-NEXT: movl $256, %edx # imm = 0x100 -; ANY-NEXT: callq memset -; ANY-NEXT: popq %rax -; ANY-NEXT: retq +; SSE2-LABEL: memset_256_nonzero_bytes: +; SSE2: pushq %rax +; SSE2-NEXT: .Ltmp0: +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: movl $42, %esi +; SSE2-NEXT: movl $256, %edx # imm = 0x100 +; SSE2-NEXT: callq memset +; SSE2-NEXT: popq %rax +; SSE2-NEXT: retq +; +; AVX1-LABEL: memset_256_nonzero_bytes: +; AVX1: vmovaps {{.*#+}} ymm0 = [1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13,1.511366e-13] +; AVX1-NEXT: vmovups %ymm0, 224(%rdi) +; AVX1-NEXT: vmovups %ymm0, 192(%rdi) +; AVX1-NEXT: vmovups %ymm0, 160(%rdi) +; AVX1-NEXT: vmovups %ymm0, 128(%rdi) +; AVX1-NEXT: vmovups %ymm0, 96(%rdi) +; AVX1-NEXT: vmovups %ymm0, 64(%rdi) +; AVX1-NEXT: vmovups %ymm0, 32(%rdi) +; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_256_nonzero_bytes: +; AVX2: vbroadcastss {{.*}}(%rip), %ymm0 +; AVX2-NEXT: vmovups %ymm0, 224(%rdi) +; AVX2-NEXT: vmovups %ymm0, 192(%rdi) +; AVX2-NEXT: vmovups %ymm0, 160(%rdi) +; AVX2-NEXT: vmovups %ymm0, 128(%rdi) +; AVX2-NEXT: vmovups %ymm0, 96(%rdi) +; AVX2-NEXT: vmovups %ymm0, 64(%rdi) +; AVX2-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1) ret void