diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -713,6 +713,22 @@
       return Op.getOperand(1);
     break;
   }
+  case ISD::SHL: {
+    // If we are only demanding sign bits then we can use the shift source
+    // directly.
+    if (const APInt *MaxSA =
+            DAG.getValidMaximumShiftAmountConstant(Op, DemandedElts)) {
+      SDValue Op0 = Op.getOperand(0);
+      unsigned ShAmt = MaxSA->getZExtValue();
+      unsigned BitWidth = DemandedBits.getBitWidth();
+      unsigned NumSignBits =
+          DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+      unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
+      if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= (UpperDemandedBits))
+        return Op0;
+    }
+    break;
+  }
   case ISD::SETCC: {
     SDValue Op0 = Op.getOperand(0);
     SDValue Op1 = Op.getOperand(1);
@@ -1441,6 +1457,18 @@
           ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
         return true;
     }
+
+    // If we are only demanding sign bits then we can use the shift source
+    // directly.
+    if (const APInt *MaxSA =
+            TLO.DAG.getValidMaximumShiftAmountConstant(Op, DemandedElts)) {
+      unsigned ShAmt = MaxSA->getZExtValue();
+      unsigned NumSignBits =
+          TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+      unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
+      if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= (UpperDemandedBits))
+        return TLO.CombineTo(Op, Op0);
+    }
     break;
   }
   case ISD::SRL: {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37155,6 +37155,14 @@
       }
     }
 
+    // If we are only demanding sign bits then we can use the shift source directly.
+    unsigned NumSignBits =
+        TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
+    unsigned UpperDemandedBits =
+        BitWidth - OriginalDemandedBits.countTrailingZeros();
+    if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
+      return TLO.CombineTo(Op, Op0);
+
     if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
                              TLO, Depth + 1))
       return true;
@@ -37432,7 +37440,19 @@
     if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
         !DemandedElts[CIdx->getZExtValue()])
       return Vec;
-    break;
+    break;
+  }
+  case X86ISD::VSHLI: {
+    // If we are only demanding sign bits then we can use the shift source
+    // directly.
+    SDValue Op0 = Op.getOperand(0);
+    unsigned ShAmt = Op.getConstantOperandVal(1);
+    unsigned BitWidth = DemandedBits.getBitWidth();
+    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+    unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
+    if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
+      return Op0;
+    break;
   }
   case X86ISD::VSRAI:
     // iff we only need the sign bit then we can use the source directly.
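Note (illustration only, not part of the patch): the new checks rely on the observation that if a value has NumSignBits known sign bits and is shifted left by ShAmt < NumSignBits, the top (NumSignBits - ShAmt) bits of the result still match the corresponding top bits of the shift source, so when only those upper bits are demanded the shift can be dropped. Below is a minimal standalone sketch of that argument for plain 32-bit integers; the names and values are illustrative and are not LLVM APIs.

#include <cassert>
#include <cstdint>

// Count how many copies of the sign bit sit at the top of V (always >= 1),
// mirroring what a ComputeNumSignBits-style query would report for a constant.
static unsigned numSignBits(int32_t V) {
  uint32_t U = static_cast<uint32_t>(V);
  uint32_t Sign = U >> 31;
  unsigned N = 0;
  for (int Bit = 31; Bit >= 0 && ((U >> Bit) & 1) == Sign; --Bit)
    ++N;
  return N;
}

int main() {
  const int32_t X = -1337;              // has 21 sign bits
  const unsigned ShAmt = 4;
  const unsigned UpperDemandedBits = 8; // only the top 8 bits are demanded
  const unsigned NumSignBits = numSignBits(X);

  // The condition used by the fold.
  assert(NumSignBits > ShAmt && NumSignBits - ShAmt >= UpperDemandedBits);

  // The demanded (upper) bits of X << ShAmt equal those of X, so the shift
  // source can be used directly.
  const uint32_t Mask = ~0u << (32 - UpperDemandedBits);
  assert(((static_cast<uint32_t>(X) << ShAmt) & Mask) ==
         (static_cast<uint32_t>(X) & Mask));
  return 0;
}

The regenerated test expectations below reflect this: left shifts feeding sign-bit-only users (vpmovmskb, truncating stores of high halves, etc.) are elided or rescheduled.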
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -1921,17 +1921,16 @@
 ; GCN-NEXT:    buffer_load_sbyte v1, off, s[4:7], 0 offset:2
 ; GCN-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 offset:4
 ; GCN-NEXT:    buffer_load_sbyte v3, off, s[4:7], 0 offset:6
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GCN-NEXT:    v_or_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_mul_f32_e32 v3, v0, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1960,17 +1959,16 @@
 ; TONGA-NEXT:    buffer_load_sbyte v1, off, s[4:7], 0 offset:2
 ; TONGA-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 offset:4
 ; TONGA-NEXT:    buffer_load_sbyte v3, off, s[4:7], 0 offset:6
-; TONGA-NEXT:    s_waitcnt vmcnt(2)
-; TONGA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; TONGA-NEXT:    v_or_b32_e32 v0, v0, v1
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
-; TONGA-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; TONGA-NEXT:    v_or_b32_e32 v2, v2, v3
+; TONGA-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; TONGA-NEXT:    v_or_b32_e32 v2, v2, v4
 ; TONGA-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; TONGA-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; TONGA-NEXT:    v_or_b32_e32 v0, v0, v4
 ; TONGA-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; TONGA-NEXT:    v_or_b32_e32 v1, 1, v1
 ; TONGA-NEXT:    v_mul_f32_e32 v3, v0, v4
 ; TONGA-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1999,17 +1997,16 @@
 ; GFX9-NEXT:    buffer_load_sbyte v1, off, s[4:7], 0 offset:2
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 offset:4
 ; GFX9-NEXT:    buffer_load_sbyte v3, off, s[4:7], 0 offset:6
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v0, v4
 ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
--- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -510,7 +510,6 @@
 ; AVX1-LABEL: v16i8_widened_with_ones:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
 ; AVX1-NEXT:    orl $-65536, %ecx # imm = 0xFFFF0000
 ; AVX1-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -922,6 +922,7 @@
 define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind {
 ; X32-SSE2-LABEL: PR45265:
 ; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebx
 ; X32-SSE2-NEXT:    pushl %edi
 ; X32-SSE2-NEXT:    pushl %esi
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -929,24 +930,27 @@
 ; X32-SSE2-NEXT:    leal (%eax,%eax,2), %edx
 ; X32-SSE2-NEXT:    movzwl 8(%ecx,%edx,4), %esi
 ; X32-SSE2-NEXT:    movsbl 10(%ecx,%edx,4), %edi
-; X32-SSE2-NEXT:    shll $16, %edi
-; X32-SSE2-NEXT:    orl %edi, %esi
+; X32-SSE2-NEXT:    movl %edi, %ebx
+; X32-SSE2-NEXT:    shll $16, %ebx
+; X32-SSE2-NEXT:    orl %esi, %ebx
 ; X32-SSE2-NEXT:    movl 4(%ecx,%edx,4), %ecx
-; X32-SSE2-NEXT:    shrdl $8, %esi, %ecx
+; X32-SSE2-NEXT:    shrdl $8, %ebx, %ecx
 ; X32-SSE2-NEXT:    xorl %eax, %ecx
 ; X32-SSE2-NEXT:    sarl $31, %eax
 ; X32-SSE2-NEXT:    sarl $31, %edi
-; X32-SSE2-NEXT:    shldl $24, %esi, %edi
+; X32-SSE2-NEXT:    shldl $24, %ebx, %edi
 ; X32-SSE2-NEXT:    xorl %eax, %edi
 ; X32-SSE2-NEXT:    orl %edi, %ecx
 ; X32-SSE2-NEXT:    jne .LBB44_1
 ; X32-SSE2-NEXT:  # %bb.2:
 ; X32-SSE2-NEXT:    popl %esi
 ; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
 ; X32-SSE2-NEXT:    jmp _Z3foov # TAILCALL
 ; X32-SSE2-NEXT:  .LBB44_1:
 ; X32-SSE2-NEXT:    popl %esi
 ; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
 ; X32-SSE2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: PR45265:
diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll
--- a/llvm/test/CodeGen/X86/promote-cmp.ll
+++ b/llvm/test/CodeGen/X86/promote-cmp.ll
@@ -74,8 +74,6 @@
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpsllq $63, %xmm3, %xmm3
-; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -597,114 +597,112 @@
 ; X64-NEXT:    paddq %xmm0, %xmm0
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movq %xmm0, %rbx
-; X64-NEXT:    movq %rbx, %r15
-; X64-NEXT:    sarq $63, %r15
-; X64-NEXT:    movq %r15, %r12
-; X64-NEXT:    shldq $31, %rbx, %r12
+; X64-NEXT:    movq %rbx, %rbp
+; X64-NEXT:    sarq $63, %rbp
+; X64-NEXT:    shldq $31, %rbx, %rbp
 ; X64-NEXT:    pxor %xmm0, %xmm0
 ; X64-NEXT:    pcmpgtd %xmm1, %xmm0
 ; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movq %xmm1, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    shlq $31, %rbx
-; X64-NEXT:    movq %rbx, %rdi
-; X64-NEXT:    movq %r12, %rsi
-; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    sarq $63, %r15
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    shlq $31, %r12
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %rbp, %rsi
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __divti3
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    subq $1, %r13
-; X64-NEXT:    sbbq $0, %rbp
-; X64-NEXT:    movq %rbx, %rdi
-; X64-NEXT:    movq %r12, %rsi
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %rbp, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __modti3
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shlq $31, %r15
-; X64-NEXT:    shrq $63, %r15
-; X64-NEXT:    xorl %r14d, %r15d
-; X64-NEXT:    testb %r15b, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT:    testb %bl, %al
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
 ; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %edx # imm = 0xFFFFFFFF
 ; X64-NEXT:    cmpq %rdx, %r13
 ; X64-NEXT:    movl $4294967295, %eax # imm = 0xFFFFFFFF
 ; X64-NEXT:    cmovbq %r13, %rax
 ; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    testq %rbp, %rbp
+; X64-NEXT:    testq %r14, %r14
 ; X64-NEXT:    cmovnsq %rdx, %r13
 ; X64-NEXT:    cmoveq %rax, %r13
-; X64-NEXT:    cmovnsq %rcx, %rbp
+; X64-NEXT:    cmovnsq %rcx, %r14
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
 ; X64-NEXT:    cmpq %rcx, %r13
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    cmovaq %r13, %rax
-; X64-NEXT:    testq %rbp, %rbp
+; X64-NEXT:    testq %r14, %r14
 ; X64-NEXT:    cmovsq %rcx, %r13
-; X64-NEXT:    cmpq $-1, %rbp
+; X64-NEXT:    cmpq $-1, %r14
 ; X64-NEXT:    cmoveq %rax, %r13
 ; X64-NEXT:    movq %r13, %xmm0
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,0,1]
-; X64-NEXT:    movq %xmm0, %rbp
-; X64-NEXT:    movq %rbp, %rbx
-; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    movq %xmm0, %rbx
 ; X64-NEXT:    movq %rbx, %r13
-; X64-NEXT:    shldq $31, %rbp, %r13
+; X64-NEXT:    sarq $63, %r13
+; X64-NEXT:    shldq $31, %rbx, %r13
 ; X64-NEXT:    pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,0,1]
 ; X64-NEXT:    movq %xmm0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    shlq $31, %rbp
-; X64-NEXT:    movq %rbp, %rdi
+; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    sarq $63, %rbp
+; X64-NEXT:    movq %rbx, %r15
+; X64-NEXT:    shlq $31, %r15
+; X64-NEXT:    movq %r15, %rdi
 ; X64-NEXT:    movq %r13, %rsi
-; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    movq %rbp, %rcx
 ; X64-NEXT:    callq __divti3
 ; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    subq $1, %r12
-; X64-NEXT:    sbbq $0, %r15
-; X64-NEXT:    movq %rbp, %rdi
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %ebp, %ebx
+; X64-NEXT:    movq %r15, %rdi
 ; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    movq %rbp, %rcx
 ; X64-NEXT:    callq __modti3
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shlq $31, %rbx
-; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r14d, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
 ; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
 ; X64-NEXT:    cmpq %rcx, %r12
 ; X64-NEXT:    movl $4294967295, %eax # imm = 0xFFFFFFFF
 ; X64-NEXT:    cmovbq %r12, %rax
-; X64-NEXT:    testq %r15, %r15
+; X64-NEXT:    testq %r14, %r14
 ; X64-NEXT:    cmovnsq %rcx, %r12
 ; X64-NEXT:    cmoveq %rax, %r12
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovnsq %rax, %r15
+; X64-NEXT:    cmovnsq %rax, %r14
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
 ; X64-NEXT:    cmpq %rcx, %r12
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    cmovaq %r12, %rax
-; X64-NEXT:    testq %r15, %r15
+; X64-NEXT:    testq %r14, %r14
 ; X64-NEXT:    cmovsq %rcx, %r12
-; X64-NEXT:    cmpq $-1, %r15
+; X64-NEXT:    cmpq $-1, %r14
 ; X64-NEXT:    cmoveq %rax, %r12
 ; X64-NEXT:    movq %r12, %xmm0
 ; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -718,11 +716,10 @@
 ; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X64-NEXT:    paddq %xmm1, %xmm1
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT:    movq %xmm1, %rbp
-; X64-NEXT:    movq %rbp, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    movq %r14, %r13
-; X64-NEXT:    shldq $31, %rbp, %r13
+; X64-NEXT:    movq %xmm1, %rbx
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    sarq $63, %r12
+; X64-NEXT:    shldq $31, %rbx, %r12
 ; X64-NEXT:    pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm1 = mem[2,3,0,1]
 ; X64-NEXT:    pxor %xmm0, %xmm0
@@ -731,104 +728,103 @@
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movq %xmm1, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    sarq $63, %rbx
-; X64-NEXT:    shlq $31, %rbp
-; X64-NEXT:    movq %rbp, %rdi
-; X64-NEXT:    movq %r13, %rsi
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    sarq $63, %rbp
+; X64-NEXT:    movq %rbx, %r15
+; X64-NEXT:    shlq $31, %r15
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq %r12, %rsi
+; X64-NEXT:    movq %rbp, %rcx
 ; X64-NEXT:    callq __divti3
-; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %r12
-; X64-NEXT:    sbbq $0, %r15
-; X64-NEXT:    movq %rbp, %rdi
-; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    subq $1, %r13
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %ebp, %ebx
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq %r12, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rbp, %rcx
 ; X64-NEXT:    callq __modti3
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shlq $31, %r14
-; X64-NEXT:    shrq $63, %r14
-; X64-NEXT:    xorl %ebx, %r14d
-; X64-NEXT:    testb %r14b, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    testb %bl, %al
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rcx, %r12
+; X64-NEXT:    cmpq %rcx, %r13
 ; X64-NEXT:    movl $4294967295, %eax # imm = 0xFFFFFFFF
-; X64-NEXT:    cmovbq %r12, %rax
-; X64-NEXT:    testq %r15, %r15
-; X64-NEXT:    cmovnsq %rcx, %r12
-; X64-NEXT:    cmoveq %rax, %r12
+; X64-NEXT:    cmovbq %r13, %rax
+; X64-NEXT:    testq %r14, %r14
+; X64-NEXT:    cmovnsq %rcx, %r13
+; X64-NEXT:    cmoveq %rax, %r13
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovnsq %rax, %r15
+; X64-NEXT:    cmovnsq %rax, %r14
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %rcx, %r12
+; X64-NEXT:    cmpq %rcx, %r13
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    cmovaq %r12, %rax
-; X64-NEXT:    testq %r15, %r15
-; X64-NEXT:    cmovsq %rcx, %r12
-; X64-NEXT:    cmpq $-1, %r15
-; X64-NEXT:    cmoveq %rax, %r12
-; X64-NEXT:    movq %r12, %xmm0
+; X64-NEXT:    cmovaq %r13, %rax
+; X64-NEXT:    testq %r14, %r14
+; X64-NEXT:    cmovsq %rcx, %r13
+; X64-NEXT:    cmpq $-1, %r14
+; X64-NEXT:    cmoveq %rax, %r13
+; X64-NEXT:    movq %r13, %xmm0
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,0,1]
-; X64-NEXT:    movq %xmm0, %rbp
-; X64-NEXT:    movq %rbp, %rbx
-; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    movq %xmm0, %rbx
 ; X64-NEXT:    movq %rbx, %r13
-; X64-NEXT:    shldq $31, %rbp, %r13
+; X64-NEXT:    sarq $63, %r13
+; X64-NEXT:    shldq $31, %rbx, %r13
 ; X64-NEXT:    pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,0,1]
 ; X64-NEXT:    movq %xmm0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    shlq $31, %rbp
-; X64-NEXT:    movq %rbp, %rdi
+; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    sarq $63, %rbp
+; X64-NEXT:    movq %rbx, %r15
+; X64-NEXT:    shlq $31, %r15
+; X64-NEXT:    movq %r15, %rdi
 ; X64-NEXT:    movq %r13, %rsi
-; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    movq %rbp, %rcx
 ; X64-NEXT:    callq __divti3
 ; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    subq $1, %r12
-; X64-NEXT:    sbbq $0, %r15
-; X64-NEXT:    movq %rbp, %rdi
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %ebp, %ebx
+; X64-NEXT:    movq %r15, %rdi
 ; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %r14, %rcx
+; X64-NEXT:    movq %rbp, %rcx
 ; X64-NEXT:    callq __modti3
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shlq $31, %rbx
-; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r14d, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
 ; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
 ; X64-NEXT:    cmpq %rcx, %r12
 ; X64-NEXT:    movl $4294967295, %eax # imm = 0xFFFFFFFF
 ; X64-NEXT:    cmovbq %r12, %rax
-; X64-NEXT:    testq %r15, %r15
+; X64-NEXT:    testq %r14, %r14
 ; X64-NEXT:    cmovnsq %rcx, %r12
 ; X64-NEXT:    cmoveq %rax, %r12
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovnsq %rax, %r15
+; X64-NEXT:    cmovnsq %rax, %r14
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
 ; X64-NEXT:    cmpq %rcx, %r12
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    cmovaq %r12, %rax
-; X64-NEXT:    testq %r15, %r15
+; X64-NEXT:    testq %r14, %r14
 ; X64-NEXT:    cmovsq %rcx, %r12
-; X64-NEXT:    cmpq $-1, %r15
+; X64-NEXT:    cmpq $-1, %r14
 ; X64-NEXT:    cmoveq %rax, %r12
 ; X64-NEXT:    movq %r12, %xmm0
 ; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
--- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
+++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -59,7 +59,7 @@
 ; CHECK-NEXT:    pextrw $1, %xmm0, %esi
 ; CHECK-NEXT:    movswl %si, %edi
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shrl $15, %eax
+; CHECK-NEXT:    shrl $16, %eax
 ; CHECK-NEXT:    leal (%rdi,%rdi), %esi
 ; CHECK-NEXT:    shrdw $15, %ax, %si
 ; CHECK-NEXT:    sarl $15, %edi
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -1577,7 +1577,6 @@
 ; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    cmpw $-1, %ax
 ; AVX1-NEXT:    sete %al
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -1566,7 +1566,6 @@
 ; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    testw %ax, %ax
 ; AVX1-NEXT:    setne %al
@@ -1657,7 +1656,6 @@
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    testw %ax, %ax
 ; AVX1-NEXT:    setne %al