diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -28204,6 +28204,7 @@ MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); // Simple i8 add case + // TODO: Add ISD::FREEZE? if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) return DAG.getNode(ISD::ADD, dl, VT, R, R); @@ -43591,20 +43592,6 @@ } } - // Hardware support for vector shifts is sparse which makes us scalarize the - // vector operations in many cases. Also, on sandybridge ADD is faster than - // shl. - // (shl V, 1) -> add V,V - if (auto *N1BV = dyn_cast(N1)) - if (auto *N1SplatC = N1BV->getConstantSplatNode()) { - assert(N0.getValueType().isVector() && "Invalid vector shift type"); - // We shift all of the values by one. In many cases we do not have - // hardware support for this operation. This is better expressed as an ADD - // of two values. - if (N1SplatC->isOne()) - return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); - } - return SDValue(); } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -5921,6 +5921,30 @@ timm:$src2)), sub_xmm)>; } +// Prefer add to shl-by-one. On many targets, ADD is faster than SHL. +let Predicates = [HasAVX512] in { + def : Pat<(v8i64 (X86vshli v8i64:$src1, 1)), + (VPADDQZrr v8i64:$src1, v8i64:$src1)>; + def : Pat<(v16i32 (X86vshli v16i32:$src1, 1)), + (VPADDDZrr v16i32:$src1, v16i32:$src1)>; + def : Pat<(v32i16 (X86vshli v32i16:$src1, 1)), + (VPADDWZrr v32i16:$src1, v32i16:$src1)>; +} +let Predicates = [HasAVX512, HasVLX] in { + def : Pat<(v4i64 (X86vshli v4i64:$src1, 1)), + (VPADDQZ256rr v4i64:$src1, v4i64:$src1)>; + def : Pat<(v8i32 (X86vshli v8i32:$src1, 1)), + (VPADDDZ256rr v8i32:$src1, v8i32:$src1)>; + def : Pat<(v16i16 (X86vshli v16i16:$src1, 1)), + (VPADDWZ256rr v16i16:$src1, v16i16:$src1)>; + def : Pat<(v2i64 (X86vshli v2i64:$src1, 1)), + (VPADDQZ128rr v2i64:$src1, v2i64:$src1)>; + def : Pat<(v4i32 (X86vshli v4i32:$src1, 1)), + (VPADDDZ128rr v4i32:$src1, v4i32:$src1)>; + def : Pat<(v8i16 (X86vshli v8i16:$src1, 1)), + (VPADDWZ128rr v8i16:$src1, v8i16:$src1)>; +} + //===-------------------------------------------------------------------===// // Variable Bit Shifts //===-------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -3622,6 +3622,32 @@ SchedWriteShuffle>; } // ExeDomain = SSEPackedInt +// Prefer add to shl-by-one. On many targets, ADD is faster than SHL. 
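+// Illustrative example (not part of the original patch text): these isel
+// patterns take over from the (shl V, 1) -> (add V, V) DAG combine removed
+// from X86ISelLowering.cpp above, so a splat shift-by-one such as
+//   %r = shl <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+// should still select a self-add, e.g. "paddd %xmm0, %xmm0" with SSE2 or
+// "vpaddd %xmm0, %xmm0, %xmm0" with AVX, rather than "pslld $1, %xmm0",
+// as the updated test checks later in this patch show.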
+let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v4i64 (X86vshli v4i64:$src1, 1)), + (VPADDQYrr v4i64:$src1, v4i64:$src1)>; + def : Pat<(v8i32 (X86vshli v8i32:$src1, 1)), + (VPADDDYrr v8i32:$src1, v8i32:$src1)>; + def : Pat<(v16i16 (X86vshli v16i16:$src1, 1)), + (VPADDWYrr v16i16:$src1, v16i16:$src1)>; +} +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v2i64 (X86vshli v2i64:$src1, 1)), + (VPADDQrr v2i64:$src1, v2i64:$src1)>; + def : Pat<(v4i32 (X86vshli v4i32:$src1, 1)), + (VPADDDrr v4i32:$src1, v4i32:$src1)>; + def : Pat<(v8i16 (X86vshli v8i16:$src1, 1)), + (VPADDWrr v8i16:$src1, v8i16:$src1)>; +} +let Predicates = [UseSSE2] in { + def : Pat<(v2i64 (X86vshli v2i64:$src1, 1)), + (PADDQrr v2i64:$src1, v2i64:$src1)>; + def : Pat<(v4i32 (X86vshli v4i32:$src1, 1)), + (PADDDrr v4i32:$src1, v4i32:$src1)>; + def : Pat<(v8i16 (X86vshli v8i16:$src1, 1)), + (PADDWrr v8i16:$src1, v8i16:$src1)>; +} + //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Comparison Instructions //===---------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -81,7 +81,7 @@ ; SSE-LABEL: combine_vec_mul_pow2c: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psllq $1, %xmm2 +; SSE-NEXT: paddq %xmm0, %xmm2 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: psllq $4, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2194,7 +2194,7 @@ ; SSE41-NEXT: pxor %xmm4, %xmm4 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: psllw $1, %xmm2 +; SSE41-NEXT: paddw %xmm2, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7] ; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] @@ -2206,7 +2206,7 @@ ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: psraw $8, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw $1, %xmm3 +; SSE41-NEXT: paddw %xmm0, %xmm3 ; SSE41-NEXT: psllw $7, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7] ; SSE41-NEXT: psrlw $8, %xmm0 @@ -2229,7 +2229,7 @@ ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4 +; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7] ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhbw 
{{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] @@ -2239,7 +2239,7 @@ ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 ; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/lower-vec-shift.ll b/llvm/test/CodeGen/X86/lower-vec-shift.ll --- a/llvm/test/CodeGen/X86/lower-vec-shift.ll +++ b/llvm/test/CodeGen/X86/lower-vec-shift.ll @@ -265,11 +265,11 @@ ; AVX1-LABEL: test11: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7] ; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -294,10 +294,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7] ; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -305,7 +305,7 @@ ; AVX2-LABEL: test12: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $3, %ymm0, %ymm1 -; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15] ; AVX2-NEXT: retq %lshr = shl <16 x i16> %a, diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -201,7 +201,7 @@ ; SSE42-NEXT: movdqa c+128(%rip), %xmm1 ; SSE42-NEXT: movd %xmm1, %eax ; SSE42-NEXT: addl b(%rip), %eax -; SSE42-NEXT: movd %eax, %xmm2 +; SSE42-NEXT: pinsrd $0, %eax, %xmm2 ; SSE42-NEXT: paddd %xmm1, %xmm2 ; SSE42-NEXT: movdqa d+144(%rip), %xmm3 ; SSE42-NEXT: psubd %xmm0, %xmm3 @@ -235,7 +235,7 @@ ; AVX1-NEXT: vmovdqa c+128(%rip), %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: addl b(%rip), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa c+144(%rip), %xmm3 @@ -317,7 +317,7 @@ ; XOP-NEXT: vmovdqa c+128(%rip), %xmm0 ; XOP-NEXT: vmovd %xmm0, %eax ; XOP-NEXT: addl b(%rip), %eax -; XOP-NEXT: vmovd %eax, %xmm1 +; XOP-NEXT: vpinsrd $0, %eax, %xmm0, %xmm1 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; XOP-NEXT: vpaddd %xmm0, %xmm0, %xmm2 ; XOP-NEXT: vmovdqa c+144(%rip), %xmm3 diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -578,247 +578,254 @@ ; X64-NEXT: subq $104, %rsp ; 
X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X64-NEXT: movdqa %xmm3, %xmm2 +; X64-NEXT: psrad $31, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64-NEXT: psrlq $31, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-NEXT: paddq %xmm0, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shldq $31, %rbx, %rbp +; X64-NEXT: movq %xmm0, %r12 +; X64-NEXT: movq %r12, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shldq $31, %r12, %r14 +; X64-NEXT: movq %r12, %r15 +; X64-NEXT: shlq $31, %r15 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r13 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r14, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %r12 +; X64-NEXT: xorl %r12d, %ebx ; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF ; X64-NEXT: cmpq %rdx, %r13 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NEXT: cmovbq %r13, %rax ; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovnsq %rdx, %r13 ; X64-NEXT: cmoveq %rax, %r13 -; X64-NEXT: cmovnsq %rcx, %r14 +; X64-NEXT: cmovnsq %rcx, %rbp ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: cmovaq %r13, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovsq %rcx, %r13 -; X64-NEXT: cmpq $-1, %r14 +; X64-NEXT: cmpq $-1, %rbp ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # 
xmm0 = mem[2,3,2,3] -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %xmm0, %r15 +; X64-NEXT: movq %r15, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: shldq $31, %r15, %rbp +; X64-NEXT: movq %r15, %r14 +; X64-NEXT: shlq $31, %r14 ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: movq %rbx, %r15 -; X64-NEXT: shlq $31, %r15 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r12 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %ebp, %ebx -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: sbbq $0, %r13 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: xorl %r15d, %ebx ; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF ; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NEXT: cmovbq %r12, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %r13, %r13 ; X64-NEXT: cmovnsq %rcx, %r12 ; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovnsq %rax, %r14 +; X64-NEXT: cmovnsq %rax, %r13 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: cmovaq %r12, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %r13, %r13 ; X64-NEXT: cmovsq %rcx, %r12 -; X64-NEXT: cmpq $-1, %r14 +; X64-NEXT: cmpq $-1, %r13 ; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movq %r12, %xmm0 ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-NEXT: psrlq $1, %xmm1 ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; X64-NEXT: # xmm1 = mem[2,3,2,3] -; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: paddq %xmm1, %xmm1 -; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm1, %rbx -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: sarq $63, %r12 -; X64-NEXT: shldq $31, %rbx, %r12 -; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; X64-NEXT: # xmm1 = mem[2,3,2,3] ; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm1, %rdx -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrad $31, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; X64-NEXT: psrlq $31, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm0, %r13 +; X64-NEXT: movq %r13, %rbp ; X64-NEXT: sarq $63, %rbp -; X64-NEXT: movq %rbx, %r15 -; X64-NEXT: shlq $31, %r15 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r12, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: shldq $31, %r13, %rbp +; X64-NEXT: movq %r13, %r14 +; X64-NEXT: shlq $31, %r14 +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm0, %rdx +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %r13 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %ebp, %ebx -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r12, %rsi +; X64-NEXT: subq $1, %r12 +; X64-NEXT: sbbq $0, %r15 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %r13 +; X64-NEXT: xorl %r13d, %ebx ; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; X64-NEXT: cmovbq %r13, %rax -; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovnsq %rcx, %r13 -; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: cmovbq %r12, %rax +; X64-NEXT: testq %r15, %r15 +; X64-NEXT: cmovnsq %rcx, %r12 +; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovnsq %rax, %r14 +; X64-NEXT: cmovnsq %rax, %r15 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: cmovaq %r13, %rax -; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovsq %rcx, %r13 -; X64-NEXT: cmpq $-1, %r14 -; X64-NEXT: cmoveq %rax, %r13 -; X64-NEXT: movq %r13, %xmm0 +; X64-NEXT: cmovaq %r12, %rax +; X64-NEXT: testq %r15, %r15 
+; X64-NEXT: cmovsq %rcx, %r12 +; X64-NEXT: cmpq $-1, %r15 +; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: movq %r12, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %xmm0, %r15 +; X64-NEXT: movq %r15, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: shldq $31, %r15, %rbp +; X64-NEXT: movq %r15, %r14 +; X64-NEXT: shlq $31, %r14 ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: movq %rbx, %r15 -; X64-NEXT: shlq $31, %r15 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r12 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %ebp, %ebx -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: sbbq $0, %r13 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: xorl %r15d, %ebx ; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF ; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NEXT: cmovbq %r12, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %r13, %r13 ; X64-NEXT: cmovnsq %rcx, %r12 ; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovnsq %rax, %r14 +; X64-NEXT: cmovnsq %rax, %r13 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: cmovaq %r12, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %r13, %r13 ; X64-NEXT: cmovsq %rcx, %r12 -; X64-NEXT: cmpq $-1, %r14 +; X64-NEXT: cmpq $-1, %r13 ; X64-NEXT: cmoveq %rax, %r12 -; X64-NEXT: movq %r12, %xmm0 -; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-NEXT: psrlq $1, %xmm1 -; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-NEXT: movq %r12, %xmm1 +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: psrlq $1, %xmm0 +; X64-NEXT: shufps $136, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = xmm0[0,2],mem[0,2] ; X64-NEXT: addq $104, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 @@ 
-837,116 +844,104 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $256, %esp # imm = 0x100 -; X86-NEXT: movl 24(%ebp), %edx -; X86-NEXT: movl 40(%ebp), %edi -; X86-NEXT: leal {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: adcl %ecx, %ecx -; X86-NEXT: andl $1, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: shldl $31, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %eax +; X86-NEXT: movl 16(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %esi, %esi +; X86-NEXT: shrdl $1, %ebx, %esi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edx ; X86-NEXT: pushl %ebx -; X86-NEXT: calll __modti3 +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %esi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl %eax +; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl 32(%ebp) +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %esi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl %eax +; X86-NEXT: calll __modti3 +; X86-NEXT: addl $32, %esp ; X86-NEXT: movl 36(%ebp), %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: sarl $31, %ebx ; X86-NEXT: movl 20(%ebp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: adcl %esi, %esi -; X86-NEXT: andl $1, %esi -; X86-NEXT: negl %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: shldl $31, %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %ecx +; X86-NEXT: shrdl $1, %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl 28(%ebp), %ebx -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl 12(%ebp), %esi -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: addl %esi, %esi -; X86-NEXT: adcl %ecx, %ecx -; X86-NEXT: andl $1, %ecx 
-; X86-NEXT: negl %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shldl $31, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: shrdl $1, %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %edx ; X86-NEXT: pushl %edx ; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %eax ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl $0 +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 32(%ebp), %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl 16(%ebp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: adcl %ebx, %ebx -; X86-NEXT: andl $1, %ebx -; X86-NEXT: negl %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shldl $31, %ecx, %edi -; X86-NEXT: shll $31, %ecx +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %edx +; X86-NEXT: shrdl $1, %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl $0 +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp @@ -955,40 +950,24 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl 32(%ebp) -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl %eax -; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx ; X86-NEXT: pushl 40(%ebp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl 36(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: pushl 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp @@ -996,8 +975,8 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1007,19 +986,19 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: sets %bl ; X86-NEXT: testl %edi, %edi -; X86-NEXT: sets %al -; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %ah -; X86-NEXT: xorb %al, %ah -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %edi, %edx +; X86-NEXT: sets %bh +; X86-NEXT: xorb %bl, %bh +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: setne %al -; X86-NEXT: testb %ah, %al -; X86-NEXT: cmovel %esi, %ecx +; X86-NEXT: testb %bh, %al +; X86-NEXT: cmovel %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -1045,7 +1024,7 @@ ; X86-NEXT: movl %esi, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bl ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bh @@ -1091,14 +1070,14 @@ ; X86-NEXT: sets %bl ; X86-NEXT: xorb %al, %bl ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl 28(%ebp) ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl 28(%ebp) +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -5846,17 +5846,17 @@ define <2 x i64> @test_mm_slli_epi16(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi16: ; SSE: # %bb.0: -; SSE-NEXT: psllw $1, %xmm0 # encoding: [0x66,0x0f,0x71,0xf0,0x01] +; SSE-NEXT: paddw %xmm0, %xmm0 # encoding: [0x66,0x0f,0xfd,0xc0] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x01] +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc0] ; 
AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x01] +; AVX512-NEXT: vpaddw %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc0] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 1) @@ -5868,17 +5868,17 @@ define <2 x i64> @test_mm_slli_epi32(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi32: ; SSE: # %bb.0: -; SSE-NEXT: pslld $1, %xmm0 # encoding: [0x66,0x0f,0x72,0xf0,0x01] +; SSE-NEXT: paddd %xmm0, %xmm0 # encoding: [0x66,0x0f,0xfe,0xc0] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xf0,0x01] +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc0] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x01] +; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc0] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 1) @@ -5890,17 +5890,17 @@ define <2 x i64> @test_mm_slli_epi64(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi64: ; SSE: # %bb.0: -; SSE-NEXT: psllq $1, %xmm0 # encoding: [0x66,0x0f,0x73,0xf0,0x01] +; SSE-NEXT: paddq %xmm0, %xmm0 # encoding: [0x66,0x0f,0xd4,0xc0] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x73,0xf0,0x01] +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xd4,0xc0] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xf0,0x01] +; AVX512-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc0] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 1) ret <2 x i64> %res diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll @@ -6312,7 +6312,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1) @@ -6394,7 +6395,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpsllq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 
+; CHECK-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 1) @@ -6506,7 +6508,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpsllw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 1) diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -311,74 +311,70 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-LABEL: vec: ; X64: # %bb.0: -; X64-NEXT: pxor %xmm8, %xmm8 -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; X64-NEXT: movq %xmm2, %rcx -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; X64-NEXT: paddq %xmm2, %xmm2 -; X64-NEXT: psllq $31, %xmm2 -; X64-NEXT: movq %xmm2, %rax +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X64-NEXT: movq %xmm3, %rax +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; X64-NEXT: movq %xmm4, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm7 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; X64-NEXT: movq %xmm2, %rax -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: movq %xmm2, %rcx +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; X64-NEXT: movq %xmm3, %rax +; X64-NEXT: movdqa %xmm1, %xmm3 +; X64-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: movq %xmm3, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm2[0] -; X64-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; X64-NEXT: movdqa %xmm7, %xmm2 -; X64-NEXT: pxor %xmm3, %xmm2 -; X64-NEXT: movdqa {{.*#+}} xmm9 = [9223372043297226751,9223372043297226751] -; X64-NEXT: movdqa %xmm9, %xmm6 -; X64-NEXT: pcmpgtd %xmm2, %xmm6 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; X64-NEXT: pcmpeqd %xmm9, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; X64-NEXT: pand %xmm4, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X64-NEXT: por %xmm5, %xmm2 +; X64-NEXT: movq %rax, %xmm3 +; X64-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0] +; X64-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; 
X64-NEXT: movdqa %xmm7, %xmm3 +; X64-NEXT: pxor %xmm4, %xmm3 +; X64-NEXT: movdqa {{.*#+}} xmm8 = [9223372043297226751,9223372043297226751] +; X64-NEXT: movdqa %xmm8, %xmm6 +; X64-NEXT: pcmpgtd %xmm3, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; X64-NEXT: pcmpeqd %xmm8, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; X64-NEXT: pand %xmm9, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; X64-NEXT: por %xmm5, %xmm3 ; X64-NEXT: movdqa {{.*#+}} xmm6 = [8589934591,8589934591] -; X64-NEXT: pand %xmm2, %xmm7 -; X64-NEXT: pandn %xmm6, %xmm2 -; X64-NEXT: por %xmm7, %xmm2 -; X64-NEXT: psrlq $1, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; X64-NEXT: paddq %xmm0, %xmm0 -; X64-NEXT: psllq $31, %xmm0 -; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: pand %xmm3, %xmm7 +; X64-NEXT: pandn %xmm6, %xmm3 +; X64-NEXT: por %xmm7, %xmm3 +; X64-NEXT: psrlq $1, %xmm3 +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: movq %xmm2, %rax ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm4 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: movq %rax, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: psrlq $32, %xmm1 ; X64-NEXT: movq %xmm1, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm0 -; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; X64-NEXT: pxor %xmm4, %xmm3 -; X64-NEXT: movdqa %xmm9, %xmm0 -; X64-NEXT: pcmpgtd %xmm3, %xmm0 +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; X64-NEXT: pxor %xmm5, %xmm4 +; X64-NEXT: movdqa %xmm8, %xmm0 +; X64-NEXT: pcmpgtd %xmm4, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; X64-NEXT: pcmpeqd %xmm9, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; X64-NEXT: pand %xmm1, %xmm3 +; X64-NEXT: pcmpeqd %xmm8, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; X64-NEXT: pand %xmm1, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-NEXT: por %xmm3, %xmm0 -; X64-NEXT: pand %xmm0, %xmm4 +; X64-NEXT: por %xmm2, %xmm0 +; X64-NEXT: pand %xmm0, %xmm5 ; X64-NEXT: pandn %xmm6, %xmm0 -; X64-NEXT: por %xmm4, %xmm0 +; X64-NEXT: por %xmm5, %xmm0 ; X64-NEXT: psrlq $1, %xmm0 -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; X64-NEXT: retq ; ; X86-LABEL: vec: @@ -387,93 +383,84 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax), %ecx +; X86-NEXT: shrl $31, %eax ; X86-NEXT: shldl $31, %ecx, %eax -; X86-NEXT: shll $31, %ecx ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx +; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: addl %ebp, %ebp -; X86-NEXT: setb %al -; X86-NEXT: shldl $31, %ebp, %eax -; X86-NEXT: shll $31, %ebp +; X86-NEXT: movl 
%edx, %edi +; X86-NEXT: leal (%ebx,%ebx), %eax +; X86-NEXT: shrl $31, %ebx +; X86-NEXT: shldl $31, %eax, %ebx ; X86-NEXT: pushl $0 -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %eax ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: addl %edi, %edi -; X86-NEXT: setb %al -; X86-NEXT: shldl $31, %edi, %eax -; X86-NEXT: shll $31, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: leal (%esi,%esi), %eax +; X86-NEXT: shrl $31, %esi +; X86-NEXT: shldl $31, %eax, %esi ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: addl %esi, %esi -; X86-NEXT: setb %al -; X86-NEXT: shldl $31, %esi, %eax -; X86-NEXT: shll $31, %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: leal (%edx,%edx), %ecx +; X86-NEXT: shrl $31, %edx +; X86-NEXT: shldl $31, %ecx, %edx +; X86-NEXT: cmpl $2, %esi +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: cmpl $1, %esi +; X86-NEXT: movl $1, %ebp +; X86-NEXT: cmovael %ebp, %esi +; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: cmpl $2, %ebx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: cmpl $1, %ebx +; X86-NEXT: cmovael %ebp, %ebx +; X86-NEXT: shldl $31, %eax, %ebx +; X86-NEXT: cmpl $2, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: cmpl $1, %edi +; X86-NEXT: cmovael %ebp, %edi +; X86-NEXT: shldl $31, %eax, %edi ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %edx +; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: cmpl $2, %edx ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: cmovael %ecx, %eax ; X86-NEXT: cmpl $1, %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: cmovael %esi, %edx -; X86-NEXT: shldl $31, %eax, %edx -; X86-NEXT: cmpl $2, %edi -; X86-NEXT: cmovael %ecx, %ebx -; X86-NEXT: cmpl $1, %edi -; X86-NEXT: cmovael %esi, %edi -; X86-NEXT: shldl $31, %ebx, %edi -; X86-NEXT: cmpl $2, %ebp -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: cmovael %ecx, %eax -; X86-NEXT: cmpl $1, %ebp -; X86-NEXT: cmovael %esi, %ebp +; X86-NEXT: cmovbl %edx, %ebp ; X86-NEXT: shldl $31, %eax, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: cmpl $2, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovael %ecx, %eax -; X86-NEXT: cmpl $1, %ebx -; X86-NEXT: cmovbl %ebx, %esi -; X86-NEXT: shldl $31, %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $12, %esp +; X86-NEXT: movl %ebp, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll --- a/llvm/test/CodeGen/X86/vec_shift6.ll +++ b/llvm/test/CodeGen/X86/vec_shift6.ll @@ -68,7 +68,7 @@ ; SSE2-LABEL: test4: ; 
SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pslld $1, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: movapd %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -76,7 +76,7 @@ ; SSE41-LABEL: test4: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pslld $1, %xmm1 +; SSE41-NEXT: paddd %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -36,7 +36,7 @@ ; SSE2-NEXT: psrlq %xmm4, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: psllq $1, %xmm0 +; SSE2-NEXT: paddq %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psllq %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] @@ -56,7 +56,7 @@ ; SSE41-NEXT: psrlq %xmm4, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pandn %xmm3, %xmm2 -; SSE41-NEXT: psllq $1, %xmm0 +; SSE41-NEXT: paddq %xmm0, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psllq %xmm2, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] @@ -74,7 +74,7 @@ ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 @@ -88,7 +88,7 @@ ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -99,7 +99,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -110,7 +110,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -121,7 +121,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -142,7 +142,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -157,7 +157,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; 
XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -172,7 +172,7 @@ ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsllq $1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -188,7 +188,7 @@ ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: psllq $1, %xmm0 +; X86-SSE2-NEXT: paddq %xmm0, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: psllq %xmm2, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] @@ -225,7 +225,7 @@ ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 -; SSE2-NEXT: pslld $1, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -260,7 +260,7 @@ ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE41-NEXT: pslld $1, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: pmulld %xmm2, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -285,7 +285,7 @@ ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -296,7 +296,7 @@ ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -307,7 +307,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -318,7 +318,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -329,7 +329,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -350,7 +350,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, 
%xmm0 ; AVX512VLBW-NEXT: retq @@ -365,7 +365,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -380,7 +380,7 @@ ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpslld $1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -409,7 +409,7 @@ ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 -; X86-SSE2-NEXT: pslld $1, %xmm0 +; X86-SSE2-NEXT: paddd %xmm0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -475,7 +475,7 @@ ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: psllw $1, %xmm0 +; SSE2-NEXT: paddw %xmm0, %xmm0 ; SSE2-NEXT: pmullw %xmm2, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 @@ -521,7 +521,7 @@ ; SSE41-NEXT: paddd %xmm4, %xmm0 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0 ; SSE41-NEXT: packusdw %xmm2, %xmm0 -; SSE41-NEXT: psllw $1, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm3 ; SSE41-NEXT: pmullw %xmm0, %xmm3 ; SSE41-NEXT: por %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 @@ -556,7 +556,7 @@ ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -572,7 +572,7 @@ ; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] @@ -590,7 +590,7 @@ ; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -608,7 +608,7 @@ ; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddw 
%xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -623,7 +623,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -645,7 +645,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -660,7 +660,7 @@ ; XOP: # %bb.0: ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vpshlw %xmm4, %xmm0, %xmm0 ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -720,7 +720,7 @@ ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X86-SSE2-NEXT: psllw $1, %xmm0 +; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: pmullw %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm4, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 @@ -1069,7 +1069,7 @@ ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: psrlq %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: psllq $1, %xmm0 +; SSE-NEXT: paddq %xmm0, %xmm0 ; SSE-NEXT: psllq %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1080,7 +1080,7 @@ ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -1091,7 +1091,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1102,7 +1102,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1113,7 +1113,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -1134,7 +1134,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllq %xmm2, 
%xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1152,7 +1152,7 @@ ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpsllq $1, %xmm0, %xmm0 +; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -1169,7 +1169,7 @@ ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: psllq $1, %xmm0 +; X86-SSE2-NEXT: paddq %xmm0, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: psllq %xmm2, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] @@ -1190,7 +1190,7 @@ ; SSE2-NEXT: andl $31, %ecx ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: psrld %xmm2, %xmm1 -; SSE2-NEXT: pslld $1, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: notl %eax ; SSE2-NEXT: andl $31, %eax ; SSE2-NEXT: movd %eax, %xmm2 @@ -1207,7 +1207,7 @@ ; SSE41-NEXT: psrld %xmm4, %xmm1 ; SSE41-NEXT: pandn %xmm3, %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; SSE41-NEXT: pslld $1, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: pslld %xmm2, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -1220,7 +1220,7 @@ ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -1233,7 +1233,7 @@ ; AVX2-NEXT: vpsrld %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -1246,7 +1246,7 @@ ; AVX512F-NEXT: vpsrld %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512F-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpslld %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1259,7 +1259,7 @@ ; AVX512VL-NEXT: vpsrld %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512VL-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpslld %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1272,7 +1272,7 @@ ; AVX512BW-NEXT: vpsrld %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512BW-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpslld %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -1295,7 +1295,7 @@ ; AVX512VLBW-NEXT: vpsrld %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512VLBW-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpslld %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1315,7 +1315,7 @@ ; XOPAVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 ; 
XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq @@ -1328,7 +1328,7 @@ ; XOPAVX2-NEXT: vpsrld %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; XOPAVX2-NEXT: vpslld $1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpslld %xmm2, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -1340,7 +1340,7 @@ ; X86-SSE2-NEXT: andl $31, %ecx ; X86-SSE2-NEXT: movd %ecx, %xmm2 ; X86-SSE2-NEXT: psrld %xmm2, %xmm1 -; X86-SSE2-NEXT: pslld $1, %xmm0 +; X86-SSE2-NEXT: paddd %xmm0, %xmm0 ; X86-SSE2-NEXT: notl %eax ; X86-SSE2-NEXT: andl $31, %eax ; X86-SSE2-NEXT: movd %eax, %xmm2 @@ -1364,7 +1364,7 @@ ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psllw $1, %xmm0 +; SSE2-NEXT: paddw %xmm0, %xmm0 ; SSE2-NEXT: psllw %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1378,7 +1378,7 @@ ; SSE41-NEXT: psrlw %xmm4, %xmm1 ; SSE41-NEXT: pandn %xmm3, %xmm2 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; SSE41-NEXT: psllw $1, %xmm0 +; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: psllw %xmm2, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -1391,7 +1391,7 @@ ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -1404,7 +1404,7 @@ ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1417,7 +1417,7 @@ ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1430,7 +1430,7 @@ ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -1453,7 +1453,7 @@ ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllw 
%xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1473,7 +1473,7 @@ ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; XOP-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -1489,7 +1489,7 @@ ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-SSE2-NEXT: psllw $1, %xmm0 +; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: psllw %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -1877,7 +1877,7 @@ ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; X86-SSE2-NEXT: pandn %xmm2, %xmm3 -; X86-SSE2-NEXT: psllq $1, %xmm0 +; X86-SSE2-NEXT: paddq %xmm0, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psllq %xmm3, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] @@ -2041,7 +2041,7 @@ ; SSE2-NEXT: pandn %xmm1, %xmm3 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psllw $1, %xmm0 +; SSE2-NEXT: paddw %xmm0, %xmm0 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 @@ -2052,7 +2052,7 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = ; SSE41-NEXT: pmulhuw %xmm1, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psllw $1, %xmm0 +; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -2061,7 +2061,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -2070,7 +2070,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -2079,7 +2079,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -2090,7 +2090,7 @@ ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] -; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -2109,7 +2109,7 @@ ; AVX512VLBW-LABEL: constant_funnnel_v8i16: ; AVX512VLBW: # 
%bb.0: ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -2123,7 +2123,7 @@ ; XOP-LABEL: constant_funnnel_v8i16: ; XOP: # %bb.0: ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -2135,7 +2135,7 @@ ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psllw $1, %xmm0 +; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: por %xmm3, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -38,12 +38,12 @@ ; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsllq $1, %xmm4, %xmm4 +; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 @@ -58,7 +58,7 @@ ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -69,7 +69,7 @@ ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -80,7 +80,7 @@ ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -91,7 +91,7 @@ ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -111,7 +111,7 @@ ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -128,9 +128,9 @@ ; 
XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; XOPAVX1-NEXT: vpsllq $1, %xmm6, %xmm6 +; XOPAVX1-NEXT: vpaddq %xmm6, %xmm6, %xmm6 ; XOPAVX1-NEXT: vpshlq %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 @@ -151,7 +151,7 @@ ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; XOPAVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -197,12 +197,12 @@ ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpslld $1, %xmm5, %xmm5 +; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -214,7 +214,7 @@ ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -225,7 +225,7 @@ ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -236,7 +236,7 @@ ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -247,7 +247,7 @@ ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -267,7 +267,7 @@ ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -284,9 +284,9 @@ ; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; XOPAVX1-NEXT: vpslld $1, %xmm6, %xmm6 +; XOPAVX1-NEXT: vpaddd %xmm6, %xmm6, %xmm6 ; XOPAVX1-NEXT: vpshld %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0 ; XOPAVX1-NEXT: 
vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 @@ -307,7 +307,7 @@ ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; XOPAVX2-NEXT: vpslld $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -364,7 +364,7 @@ ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4 +; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 @@ -375,7 +375,7 @@ ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -397,7 +397,7 @@ ; AVX2-NEXT: vpackusdw %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] -; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15] ; AVX2-NEXT: vpsllvd %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4 @@ -418,7 +418,7 @@ ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 @@ -434,7 +434,7 @@ ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 @@ -448,7 +448,7 @@ ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddw 
%ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -468,7 +468,7 @@ ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -485,9 +485,9 @@ ; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; XOPAVX1-NEXT: vpsllw $1, %xmm6, %xmm6 +; XOPAVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6 ; XOPAVX1-NEXT: vpshlw %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 @@ -507,7 +507,7 @@ ; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6 ; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5 ; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0 @@ -818,9 +818,9 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsllq $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -832,7 +832,7 @@ ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -843,7 +843,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -854,7 +854,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -865,7 +865,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -885,7 +885,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: 
vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -907,9 +907,9 @@ ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpsllq $1, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpaddq %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -921,7 +921,7 @@ ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -943,9 +943,9 @@ ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpslld $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpslld %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -959,7 +959,7 @@ ; AVX2-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -972,7 +972,7 @@ ; AVX512F-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512F-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpslld %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -985,7 +985,7 @@ ; AVX512VL-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512VL-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpslld %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -998,7 +998,7 @@ ; AVX512BW-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512BW-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpslld %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1020,7 +1020,7 @@ ; AVX512VLBW-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512VLBW-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpslld %xmm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1044,9 +1044,9 @@ ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpslld $1, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpslld %xmm2, %xmm3, %xmm3 -; XOPAVX1-NEXT: 
vpslld $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1060,7 +1060,7 @@ ; XOPAVX2-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; XOPAVX2-NEXT: vpslld $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpslld %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -1082,9 +1082,9 @@ ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsllw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1098,7 +1098,7 @@ ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1111,7 +1111,7 @@ ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1124,7 +1124,7 @@ ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -1137,7 +1137,7 @@ ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1159,7 +1159,7 @@ ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1183,9 +1183,9 @@ ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpsllw $1, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; 
XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1199,7 +1199,7 @@ ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -1616,10 +1616,10 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm2 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1630,7 +1630,7 @@ ; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1640,7 +1640,7 @@ ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1650,7 +1650,7 @@ ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -1661,7 +1661,7 @@ ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1678,7 +1678,7 @@ ; AVX512VLBW-LABEL: constant_funnnel_v16i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1695,10 +1695,10 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm2 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm2 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm0 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1709,7 +1709,7 @@ ; XOPAVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -22,7 +22,7 @@ ; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -33,7 +33,7 @@ ; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512VL-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -44,7 +44,7 @@ ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -61,7 +61,7 @@ ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -82,7 +82,7 @@ ; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -93,7 +93,7 @@ ; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512VL-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -104,7 +104,7 @@ ; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -121,7 +121,7 @@ ; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpslld $1, %zmm0, %zmm0 +; 
AVX512VLBW-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -153,14 +153,14 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 ; AVX512F-NEXT: vpandnq %zmm3, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm4 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm4 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vpsllvd %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 @@ -185,14 +185,14 @@ ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 ; AVX512VL-NEXT: vpandnq %zmm3, %zmm2, %zmm2 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm4 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm4 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512VL-NEXT: vpsllvd %zmm3, %zmm4, %zmm3 ; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm2 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 @@ -206,7 +206,7 @@ ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -223,7 +223,7 @@ ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -534,7 +534,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -545,7 +545,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -556,7 +556,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -574,7 +574,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -599,7 +599,7 @@ ; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512F-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpslld %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -612,7 +612,7 @@ ; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512VL-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512VL-NEXT: vpslld %xmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -625,7 +625,7 @@ ; AVX512BW-NEXT: vpsrld %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512BW-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpslld %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -645,7 +645,7 @@ ; AVX512VLBW-NEXT: vpsrld %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512VLBW-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpslld %xmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -674,9 +674,9 @@ ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vextracti64x4 
$1, %zmm0, %ymm3 -; AVX512F-NEXT: vpsllw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -694,9 +694,9 @@ ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpsllw $1, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddw %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -710,7 +710,7 @@ ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -730,7 +730,7 @@ ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -994,10 +994,10 @@ ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpsllw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2,1] ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -1015,10 +1015,10 @@ ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vpsllw $1, %ymm2, %ymm2 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2,1] ; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -1027,7 +1027,7 @@ ; AVX512BW-LABEL: constant_funnnel_v32i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -1041,7 +1041,7 @@ ; AVX512VLBW-LABEL: constant_funnnel_v32i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsrlvw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -18,7 +18,7 @@ ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 ; CHECK-NEXT: pmulhw %xmm1, %xmm0 -; CHECK-NEXT: psllw $1, %xmm0 +; CHECK-NEXT: paddw %xmm0, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: retq %t = call <4 x i16> @llvm.smul.fix.v4i16(<4 x i16> , <4 x i16> %a, i32 15) @@ -33,7 +33,7 @@ ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 ; CHECK-NEXT: pmulhuw %xmm1, %xmm0 -; CHECK-NEXT: psllw $1, %xmm0 +; CHECK-NEXT: paddw %xmm0, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: retq %t = call <4 x i16> @llvm.umul.fix.v4i16(<4 x i16> , <4 x i16> %a, i32 15) diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -683,7 +683,7 @@ ; SSE2-LABEL: constant_shift_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllq $1, %xmm1 +; SSE2-NEXT: paddq %xmm0, %xmm1 ; SSE2-NEXT: psllq $7, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq @@ -692,14 +692,14 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psllq $7, %xmm1 -; SSE41-NEXT: psllq $1, %xmm0 +; SSE41-NEXT: paddq %xmm0, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; @@ -731,7 +731,7 @@ ; X86-SSE-LABEL: constant_shift_v2i64: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psllq $1, %xmm1 +; X86-SSE-NEXT: paddq %xmm0, %xmm1 ; X86-SSE-NEXT: psllq $7, %xmm0 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X86-SSE-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -753,7 +753,7 @@ ; AVX1-NEXT: vpsllq $31, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsllq $7, %xmm0, %xmm2 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -793,7 +793,7 @@ ; X86-AVX1-NEXT: vpsllq $31, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; X86-AVX1-NEXT: vpsllq $7, %xmm0, %xmm2 -; X86-AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl diff --git a/llvm/test/CodeGen/X86/widen_arith-4.ll b/llvm/test/CodeGen/X86/widen_arith-4.ll --- a/llvm/test/CodeGen/X86/widen_arith-4.ll +++ 
b/llvm/test/CodeGen/X86/widen_arith-4.ll @@ -65,7 +65,7 @@ ; SSE41-NEXT: psubw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psllw $2, %xmm2 -; SSE41-NEXT: psllw $1, %xmm1 +; SSE41-NEXT: paddw %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] ; SSE41-NEXT: pextrw $4, %xmm1, 8(%rcx,%rax) ; SSE41-NEXT: movq %xmm2, (%rcx,%rax)