diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -46091,8 +46091,17 @@ // We shift all of the values by one. In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. - if (N1SplatC->isOne()) - return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); + if (N1SplatC->isOne()) { + // NOTE: N0 may be undef at run-time, but (shl N0, 1) must be an even + // number (LSB must be 0). (add undef, undef) however can be any value. + // To make this safe, we must freeze N0 to ensure that register + // allocation uses the same register for an undefined value. This + // ensures that the result will still be even and preserves the original + // semantics. + SDLoc DL(N); + N0 = DAG.getNode(ISD::FREEZE, DL, VT, N0); + return DAG.getNode(ISD::ADD, DL, VT, N0, N0); + } } return SDValue(); diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -248,14 +248,16 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [10,10,10,10] ; AVX1-NEXT: vpsubd 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) -; AVX1-NEXT: vmovdqu %xmm0, (%rsi) -; AVX1-NEXT: vmovdqu %xmm3, 16(%rdi) -; AVX1-NEXT: vmovdqu %xmm2, (%rdi) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vmovdqu %ymm2, (%rsi) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rdi) +; AVX1-NEXT: vmovdqu %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR52039: diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -233,35 +233,36 @@ ; AVX1-LABEL: PR42833: ; AVX1: # %bb.0: ; AVX1-NEXT: movl b(%rip), %eax +; AVX1-NEXT: vmovdqu c+128(%rip), %ymm0 ; AVX1-NEXT: addl c+128(%rip), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vmovdqa c+128(%rip), %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa c+144(%rip), %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] -; AVX1-NEXT: vmovdqa d+144(%rip), %xmm2 -; AVX1-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vmovdqa c+128(%rip), %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] +; AVX1-NEXT: vmovdqa d+144(%rip), %xmm1 +; AVX1-NEXT: vpsubd c+144(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmovups %ymm0, c+128(%rip) -; AVX1-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa d+128(%rip), %xmm1 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa d+176(%rip), %xmm1 -; AVX1-NEXT: vmovdqa c+176(%rip), %xmm3 -; 
AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu c+160(%rip), %ymm0 +; AVX1-NEXT: vpinsrd $0, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa d+128(%rip), %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovdqa d+176(%rip), %xmm3 +; AVX1-NEXT: vpsubd c+176(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vmovdqa d+160(%rip), %xmm4 -; AVX1-NEXT: vmovdqa c+160(%rip), %xmm5 -; AVX1-NEXT: vpsubd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa %xmm2, d+144(%rip) +; AVX1-NEXT: vpsubd c+160(%rip), %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa %xmm1, d+144(%rip) ; AVX1-NEXT: vmovdqa %xmm4, d+160(%rip) -; AVX1-NEXT: vmovdqa %xmm1, d+176(%rip) -; AVX1-NEXT: vmovdqa %xmm0, d+128(%rip) -; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm0 -; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, c+160(%rip) -; AVX1-NEXT: vmovdqa %xmm0, c+176(%rip) +; AVX1-NEXT: vmovdqa %xmm3, d+176(%rip) +; AVX1-NEXT: vmovdqa %xmm2, d+128(%rip) +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, c+160(%rip) +; AVX1-NEXT: vmovdqa %xmm1, c+176(%rip) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -315,35 +316,36 @@ ; XOP-LABEL: PR42833: ; XOP: # %bb.0: ; XOP-NEXT: movl b(%rip), %eax +; XOP-NEXT: vmovdqu c+128(%rip), %ymm0 ; XOP-NEXT: addl c+128(%rip), %eax -; XOP-NEXT: vmovd %eax, %xmm0 -; XOP-NEXT: vmovdqa c+128(%rip), %xmm1 -; XOP-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vpaddd %xmm1, %xmm1, %xmm2 -; XOP-NEXT: vmovdqa c+144(%rip), %xmm3 -; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] -; XOP-NEXT: vmovdqa d+144(%rip), %xmm2 -; XOP-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 +; XOP-NEXT: vmovd %eax, %xmm1 +; XOP-NEXT: vmovdqa c+128(%rip), %xmm2 +; XOP-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpaddd %xmm0, %xmm0, %xmm3 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOP-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] +; XOP-NEXT: vmovdqa d+144(%rip), %xmm1 +; XOP-NEXT: vpsubd c+144(%rip), %xmm1, %xmm1 ; XOP-NEXT: vmovups %ymm0, c+128(%rip) -; XOP-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 -; XOP-NEXT: vmovdqa d+128(%rip), %xmm1 -; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vmovdqa d+176(%rip), %xmm1 -; XOP-NEXT: vmovdqa c+176(%rip), %xmm3 -; XOP-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; XOP-NEXT: vmovdqu c+160(%rip), %ymm0 +; XOP-NEXT: vpinsrd $0, %eax, %xmm2, %xmm2 +; XOP-NEXT: vmovdqa d+128(%rip), %xmm3 +; XOP-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; XOP-NEXT: vmovdqa d+176(%rip), %xmm3 +; XOP-NEXT: vpsubd c+176(%rip), %xmm3, %xmm3 ; XOP-NEXT: vmovdqa d+160(%rip), %xmm4 -; XOP-NEXT: vmovdqa c+160(%rip), %xmm5 -; XOP-NEXT: vpsubd %xmm5, %xmm4, %xmm4 -; XOP-NEXT: vmovdqa %xmm2, d+144(%rip) +; XOP-NEXT: vpsubd c+160(%rip), %xmm4, %xmm4 +; XOP-NEXT: vmovdqa %xmm1, d+144(%rip) ; XOP-NEXT: vmovdqa %xmm4, d+160(%rip) -; XOP-NEXT: vmovdqa %xmm1, d+176(%rip) -; XOP-NEXT: vmovdqa %xmm0, d+128(%rip) -; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm0 -; XOP-NEXT: vpaddd %xmm5, %xmm5, %xmm1 -; XOP-NEXT: vmovdqa %xmm1, c+160(%rip) -; XOP-NEXT: vmovdqa %xmm0, c+176(%rip) +; XOP-NEXT: vmovdqa %xmm3, d+176(%rip) +; XOP-NEXT: vmovdqa %xmm2, d+128(%rip) +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOP-NEXT: vpaddd %xmm1, %xmm1, %xmm1 +; XOP-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; XOP-NEXT: vmovdqa %xmm0, c+160(%rip) +; XOP-NEXT: vmovdqa %xmm1, c+176(%rip) ; XOP-NEXT: 
vzeroupper ; XOP-NEXT: retq %1 = load i32, i32* @b, align 4 diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -138,7 +138,7 @@ define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) { ; XOPAVX1-LABEL: rot_v4i32_mask_ashr1: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -146,7 +146,7 @@ ; ; XOPAVX2-LABEL: rot_v4i32_mask_ashr1: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsrad $25, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -154,7 +154,7 @@ ; ; AVX512-LABEL: rot_v4i32_mask_ashr1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsrad $25, %xmm0, %xmm0 +; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -579,137 +579,154 @@ ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx ; X64-NEXT: subq $104, %rsp -; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-NEXT: paddq %xmm0, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shldq $31, %rbx, %rbp +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pcmpgtd %xmm0, %xmm3 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: paddq %xmm2, %xmm2 +; X64-NEXT: psllq $31, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: psrad $31, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; X64-NEXT: psrlq $31, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm2, %r12 +; X64-NEXT: movq %r12, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shldq $31, %r12, %r14 +; X64-NEXT: movq %r12, %r15 +; X64-NEXT: shlq $31, %r15 ; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pcmpgtd %xmm1, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: 
callq __divti3@PLT ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r13 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %r12 +; X64-NEXT: xorl %r12d, %ebx ; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF ; X64-NEXT: cmpq %rdx, %r13 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NEXT: cmovbq %r13, %rax ; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovnsq %rdx, %r13 ; X64-NEXT: cmoveq %rax, %r13 -; X64-NEXT: cmovnsq %rcx, %r14 +; X64-NEXT: cmovnsq %rcx, %rbp ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: cmovaq %r13, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovsq %rcx, %r13 -; X64-NEXT: cmpq $-1, %r14 +; X64-NEXT: cmpq $-1, %rbp ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %xmm0, %r15 +; X64-NEXT: movq %r15, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: shldq $31, %r15, %rbp +; X64-NEXT: movq %r15, %r14 +; X64-NEXT: shlq $31, %r14 ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: movq %rbx, %r15 -; X64-NEXT: shlq $31, %r15 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r12 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %ebp, %ebx -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: sbbq $0, %r13 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: xorl %r15d, %ebx ; X64-NEXT: testb %bl, %al -; 
X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF ; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NEXT: cmovbq %r12, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %r13, %r13 ; X64-NEXT: cmovnsq %rcx, %r12 ; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovnsq %rax, %r14 +; X64-NEXT: cmovnsq %rax, %r13 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: cmovaq %r12, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %r13, %r13 ; X64-NEXT: cmovsq %rcx, %r12 -; X64-NEXT: cmpq $-1, %r14 +; X64-NEXT: cmpq $-1, %r13 ; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movq %r12, %xmm0 ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-NEXT: psrlq $1, %xmm1 ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; X64-NEXT: # xmm1 = mem[2,3,2,3] -; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: paddq %xmm1, %xmm1 -; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm1, %rbx -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: sarq $63, %r12 -; X64-NEXT: shldq $31, %rbx, %r12 +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[2,3,2,3] +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: paddq %xmm0, %xmm0 +; X64-NEXT: psllq $31, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrad $31, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; X64-NEXT: psrlq $31, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm0, %r13 +; X64-NEXT: movq %r13, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: shldq $31, %r13, %rbp +; X64-NEXT: movq %r13, %r14 +; X64-NEXT: shlq $31, %r14 ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; X64-NEXT: # xmm1 = mem[2,3,2,3] ; X64-NEXT: pxor %xmm0, %xmm0 @@ -718,103 +735,101 @@ ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: movq %rbx, %r15 -; X64-NEXT: shlq $31, %r15 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r12, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %r13 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %ebp, %ebx -; X64-NEXT: movq %r15, 
%rdi -; X64-NEXT: movq %r12, %rsi +; X64-NEXT: subq $1, %r12 +; X64-NEXT: sbbq $0, %r15 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %r13 +; X64-NEXT: xorl %r13d, %ebx ; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; X64-NEXT: cmovbq %r13, %rax -; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovnsq %rcx, %r13 -; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: cmovbq %r12, %rax +; X64-NEXT: testq %r15, %r15 +; X64-NEXT: cmovnsq %rcx, %r12 +; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovnsq %rax, %r14 +; X64-NEXT: cmovnsq %rax, %r15 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: cmovaq %r13, %rax -; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovsq %rcx, %r13 -; X64-NEXT: cmpq $-1, %r14 -; X64-NEXT: cmoveq %rax, %r13 -; X64-NEXT: movq %r13, %xmm0 +; X64-NEXT: cmovaq %r12, %rax +; X64-NEXT: testq %r15, %r15 +; X64-NEXT: cmovsq %rcx, %r12 +; X64-NEXT: cmpq $-1, %r15 +; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: movq %r12, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %xmm0, %r15 +; X64-NEXT: movq %r15, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: shldq $31, %r15, %rbp +; X64-NEXT: movq %r15, %r14 +; X64-NEXT: shlq $31, %r14 ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: movq %rbx, %r15 -; X64-NEXT: shlq $31, %r15 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r12 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %ebp, %ebx -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: sbbq $0, %r13 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: xorl %r15d, %ebx ; X64-NEXT: 
testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF ; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NEXT: cmovbq %r12, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %r13, %r13 ; X64-NEXT: cmovnsq %rcx, %r12 ; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovnsq %rax, %r14 +; X64-NEXT: cmovnsq %rax, %r13 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: cmovaq %r12, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %r13, %r13 ; X64-NEXT: cmovsq %rcx, %r12 -; X64-NEXT: cmpq $-1, %r14 +; X64-NEXT: cmpq $-1, %r13 ; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movq %r12, %xmm0 ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll --- a/llvm/test/CodeGen/X86/vector-gep.ll +++ b/llvm/test/CodeGen/X86/vector-gep.ll @@ -122,10 +122,11 @@ ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-32, %esp ; CHECK-NEXT: subl $160, %esp -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm3 +; CHECK-NEXT: vmovdqa 40(%ebp), %ymm3 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm4 ; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm5 -; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 -; CHECK-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 @@ -144,47 +145,46 @@ ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 40(%ebp), %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 56(%ebp), %xmm0 +; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 72(%ebp), %xmm0 +; CHECK-NEXT: vmovdqa 72(%ebp), %ymm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm5, %xmm1 +; CHECK-NEXT: vmovdqa %xmm1, (%esp) # 16-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 -; CHECK-NEXT: vmovdqa %xmm0, (%esp) # 16-byte Spill -; CHECK-NEXT: vmovdqa 88(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm4 +; CHECK-NEXT: vmovdqa 104(%ebp), %ymm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm2 -; CHECK-NEXT: vmovdqa 104(%ebp), %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm1 -; CHECK-NEXT: vmovdqa 120(%ebp), %xmm0 +; CHECK-NEXT: vmovdqa 136(%ebp), %ymm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm7 +; CHECK-NEXT: vpaddd %xmm7, %xmm5, %xmm7 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; 
CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 -; CHECK-NEXT: vmovdqa 136(%ebp), %xmm6 -; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6 +; CHECK-NEXT: vmovdqa 168(%ebp), %ymm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm6 ; CHECK-NEXT: vpaddd %xmm6, %xmm5, %xmm6 -; CHECK-NEXT: vmovdqa 152(%ebp), %xmm7 -; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7 -; CHECK-NEXT: vpaddd %xmm7, %xmm5, %xmm7 -; CHECK-NEXT: vmovdqa 168(%ebp), %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vmovdqa 184(%ebp), %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm5, %xmm1 ; CHECK-NEXT: movl 8(%ebp), %eax -; CHECK-NEXT: vmovdqa %xmm3, 240(%eax) -; CHECK-NEXT: vmovdqa %xmm4, 224(%eax) -; CHECK-NEXT: vmovdqa %xmm7, 208(%eax) -; CHECK-NEXT: vmovdqa %xmm6, 192(%eax) -; CHECK-NEXT: vmovdqa %xmm0, 176(%eax) -; CHECK-NEXT: vmovdqa %xmm1, 160(%eax) -; CHECK-NEXT: vmovdqa %xmm2, 144(%eax) +; CHECK-NEXT: vmovdqa %xmm1, 240(%eax) +; CHECK-NEXT: vmovdqa %xmm6, 224(%eax) +; CHECK-NEXT: vmovdqa %xmm0, 208(%eax) +; CHECK-NEXT: vmovdqa %xmm7, 192(%eax) +; CHECK-NEXT: vmovdqa %xmm2, 176(%eax) +; CHECK-NEXT: vmovdqa %xmm3, 160(%eax) +; CHECK-NEXT: vmovdqa %xmm4, 144(%eax) ; CHECK-NEXT: vmovaps (%esp), %xmm0 # 16-byte Reload ; CHECK-NEXT: vmovaps %xmm0, 128(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
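A minimal IR-level sketch of the reasoning in the NOTE comment added above (hypothetical function name, not part of this patch; the actual fold runs on SelectionDAG nodes, but undef and freeze behave the same way in IR):

; (shl %x, 1) always has a zero low bit in every lane, even when %x is undef.
; Rewriting it as (add %x, %x) without a freeze would let each use of an undef
; %x resolve to a different value, so the sum could be odd. Freezing %x first
; pins both operands to the same arbitrary-but-fixed value, so the result stays
; even, which is what the combine above now emits.
define <4 x i32> @shl_by_one(<4 x i32> %x) {
  %f = freeze <4 x i32> %x
  %r = add <4 x i32> %f, %f
  ret <4 x i32> %r
}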