diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41806,6 +41806,23 @@
         ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
       return Op.getOperand(1);
     break;
+  case X86ISD::ANDNP: {
+    // ANDNP = (~LHS & RHS);
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
+
+    KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
+    KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
+
+    // If all of the demanded bits are known 1 on one side, return the other.
+    // These bits cannot contribute to the result of the 'andn' in this
+    // context.
+    if (DemandedBits.isSubsetOf(LHSKnown.One | RHSKnown.One))
+      return LHS;
+    if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
+      return RHS;
+    break;
+  }
   }
 
   APInt ShuffleUndef, ShuffleZero;
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -690,20 +690,22 @@
 define <8 x i16> @pr38477(<8 x i16> %a0) {
 ; SSE2-LABEL: pr38477:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4957,57457,4103,16385,35545,2048,2115]
-; SSE2-NEXT: pmulhuw %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115]
+; SSE2-NEXT: pmulhuw %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubw %xmm1, %xmm2
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: paddw %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm1, %xmm2
 ; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psubw %xmm2, %xmm0
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: paddw %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -85,9 +85,8 @@
 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; CHECK-NEXT: por %xmm0, %xmm2
 ; CHECK-NEXT: pand %xmm2, %xmm1
-; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-NEXT: por %xmm1, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; CHECK-NEXT: por %xmm2, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; CHECK-NEXT: retq
 entry:
   %conv = fptoui <2 x double> %x to <2 x i64>
@@ -231,18 +230,18 @@
 define <4 x i32> @utest_f32i32(<4 x float> %x) {
 ; CHECK-LABEL: utest_f32i32:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: subss %xmm2, %xmm1
-; CHECK-NEXT: cvttss2si %xmm1, %rax
-; CHECK-NEXT: cvttss2si %xmm0, %rcx
+; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+;
CHECK-NEXT: subss %xmm2, %xmm0 +; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: cvttss2si %xmm1, %rcx ; CHECK-NEXT: movq %rcx, %rdx ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: movq %rdx, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; CHECK-NEXT: movq %rdx, %xmm0 +; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] ; CHECK-NEXT: cvttss2si %xmm3, %rax ; CHECK-NEXT: subss %xmm2, %xmm3 ; CHECK-NEXT: cvttss2si %xmm3, %rcx @@ -251,9 +250,9 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm3 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3] ; CHECK-NEXT: cvttss2si %xmm3, %rax ; CHECK-NEXT: subss %xmm2, %xmm3 ; CHECK-NEXT: cvttss2si %xmm3, %rcx @@ -262,45 +261,42 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm3 -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: subss %xmm2, %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rcx +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: cvttss2si %xmm1, %rax +; CHECK-NEXT: subss %xmm2, %xmm1 +; CHECK-NEXT: cvttss2si %xmm1, %rcx ; CHECK-NEXT: movq %rax, %rdx ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; CHECK-NEXT: movdqa %xmm0, %xmm4 -; CHECK-NEXT: pxor %xmm3, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259455,9223372039002259455] -; CHECK-NEXT: movdqa %xmm6, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm5 -; CHECK-NEXT: pand %xmm5, %xmm0 -; CHECK-NEXT: pandn %xmm2, %xmm5 -; CHECK-NEXT: por %xmm0, %xmm5 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm6 +; CHECK-NEXT: movq %rdx, %xmm1 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] ; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: pand %xmm4, %xmm1 +; CHECK-NEXT: por %xmm4, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 
+; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm3 +; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: por %xmm3, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm2, %xmm0 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; CHECK-NEXT: retq entry: %conv = fptoui <4 x float> %x to <4 x i64> @@ -549,40 +545,36 @@ ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; CHECK-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: pand %xmm4, %xmm0 -; CHECK-NEXT: pandn %xmm1, %xmm4 -; CHECK-NEXT: por %xmm0, %xmm4 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm6, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm3 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-NEXT: movq %rdx, %xmm6 +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; CHECK-NEXT: # xmm6 = xmm6[0],mem[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] +; CHECK-NEXT: movdqa %xmm6, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm5 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] ; CHECK-NEXT: pand %xmm3, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; CHECK-NEXT: por %xmm2, %xmm0 -; CHECK-NEXT: movdqa %xmm6, %xmm2 -; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: pandn %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm3 +; CHECK-NEXT: pand %xmm3, %xmm6 +; CHECK-NEXT: por %xmm3, %xmm6 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; CHECK-NEXT: pand %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm1, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -738,9 +730,8 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = <2147549183,2147549183,u,u> ; CHECK-NEXT: 
pcmpgtd %xmm1, %xmm2 ; CHECK-NEXT: andpd %xmm2, %xmm0 -; CHECK-NEXT: andnpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-NEXT: orpd %xmm0, %xmm2 -; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; CHECK-NEXT: orpd %xmm2, %xmm0 +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retq entry: %conv = fptoui <2 x double> %x to <2 x i32> @@ -807,9 +798,8 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] ; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 ; CHECK-NEXT: pand %xmm2, %xmm0 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq @@ -1697,9 +1687,8 @@ ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-NEXT: por %xmm0, %xmm2 ; CHECK-NEXT: pand %xmm2, %xmm1 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-NEXT: por %xmm1, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; CHECK-NEXT: por %xmm2, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; CHECK-NEXT: retq entry: %conv = fptoui <2 x double> %x to <2 x i64> @@ -1838,18 +1827,18 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) { ; CHECK-LABEL: utest_f32i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: subss %xmm2, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %rax -; CHECK-NEXT: cvttss2si %xmm0, %rcx +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: subss %xmm2, %xmm0 +; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: cvttss2si %xmm1, %rcx ; CHECK-NEXT: movq %rcx, %rdx ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: movq %rdx, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; CHECK-NEXT: movq %rdx, %xmm0 +; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] ; CHECK-NEXT: cvttss2si %xmm3, %rax ; CHECK-NEXT: subss %xmm2, %xmm3 ; CHECK-NEXT: cvttss2si %xmm3, %rcx @@ -1858,9 +1847,9 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm3 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3] ; CHECK-NEXT: cvttss2si %xmm3, %rax ; CHECK-NEXT: subss %xmm2, %xmm3 ; CHECK-NEXT: cvttss2si %xmm3, %rcx @@ -1869,18 +1858,18 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm3 -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: subss %xmm2, %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %rcx +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: cvttss2si %xmm1, %rax +; CHECK-NEXT: subss %xmm2, %xmm1 +; CHECK-NEXT: cvttss2si %xmm1, %rcx ; CHECK-NEXT: movq %rax, %rdx ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; CHECK-NEXT: movq %rdx, %xmm1 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm3[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: movdqa %xmm1, %xmm3 ; CHECK-NEXT: pxor %xmm2, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 @@ -1891,23 +1880,20 @@ ; CHECK-NEXT: pand %xmm4, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; CHECK-NEXT: pand %xmm4, %xmm0 -; CHECK-NEXT: pandn %xmm3, %xmm4 -; CHECK-NEXT: por %xmm0, %xmm4 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-NEXT: pand %xmm4, %xmm1 +; CHECK-NEXT: por %xmm4, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] -; CHECK-NEXT: pand %xmm6, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; CHECK-NEXT: por %xmm2, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm3, %xmm0 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm3 +; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: por %xmm3, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; CHECK-NEXT: retq entry: %conv = fptoui <4 x float> %x to <4 x i64> @@ -2151,11 +2137,11 @@ ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movq %rdx, %xmm6 +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; CHECK-NEXT: # xmm6 = xmm6[0],mem[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] -; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm6, %xmm2 ; CHECK-NEXT: pxor %xmm1, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-NEXT: pcmpeqd %xmm1, %xmm3 @@ -2166,25 +2152,21 @@ ; CHECK-NEXT: pand %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; CHECK-NEXT: pand %xmm3, %xmm0 -; CHECK-NEXT: pandn %xmm2, %xmm3 -; CHECK-NEXT: por %xmm0, %xmm3 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm6, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm1, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-NEXT: pand %xmm3, %xmm6 +; CHECK-NEXT: por %xmm3, %xmm6 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: movdqa %xmm6, %xmm1 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm2, %xmm0 -; CHECK-NEXT: por %xmm1, %xmm0 -; 
CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; CHECK-NEXT: pand %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm1, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -2335,9 +2317,8 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = <2147549183,2147549183,u,u> ; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 ; CHECK-NEXT: andpd %xmm2, %xmm0 -; CHECK-NEXT: andnpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-NEXT: orpd %xmm0, %xmm2 -; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; CHECK-NEXT: orpd %xmm2, %xmm0 +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retq entry: %conv = fptoui <2 x double> %x to <2 x i32> @@ -2399,9 +2380,8 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] ; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 ; CHECK-NEXT: pand %xmm2, %xmm0 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -12,53 +12,50 @@ ; SSE2-LABEL: truncstore_v8i64_v8i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [9223372039002259455,9223372039002259455] -; SSE2-NEXT: movdqa %xmm12, %xmm6 +; SSE2-NEXT: pxor %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm11, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pand %xmm9, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm10 -; SSE2-NEXT: movdqa %xmm12, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm13 -; SSE2-NEXT: pand %xmm13, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm13 -; SSE2-NEXT: por %xmm0, %xmm13 -; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm6[0,2] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm9 +; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 +; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[0,0,2,2] +; SSE2-NEXT: pand %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm5 @@ -70,32 +67,30 @@ ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB0_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movss %xmm13, (%rdi) +; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: .LBB0_2: # %else -; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: por %xmm9, %xmm12 ; SSE2-NEXT: por %xmm6, %xmm7 ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB0_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: movd %xmm1, 4(%rdi) ; SSE2-NEXT: .LBB0_4: # %else2 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm0 +; SSE2-NEXT: pand %xmm12, %xmm3 ; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm7 ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movd %xmm1, 8(%rdi) ; SSE2-NEXT: .LBB0_6: # %else4 -; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: por %xmm12, %xmm3 ; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB0_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: movd %xmm0, 12(%rdi) ; SSE2-NEXT: .LBB0_8: # %else6 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] @@ -319,71 +314,66 @@ ; SSE2-LABEL: truncstore_v8i64_v8i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = 
[9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm12 -; SSE2-NEXT: por %xmm2, %xmm12 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; 
SSE2-NEXT: pcmpgtd %xmm6, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] ; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm5 @@ -1296,35 +1286,32 @@ ; SSE2-LABEL: truncstore_v4i64_v4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; 
SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %eax ; SSE2-NEXT: xorl $15, %eax @@ -1342,21 +1329,21 @@ ; SSE2-NEXT: .LBB3_8: # %else6 ; SSE2-NEXT: retq ; SSE2-NEXT: .LBB3_1: # %cond.store -; SSE2-NEXT: movss %xmm1, (%rdi) +; SSE2-NEXT: movss %xmm0, (%rdi) ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB3_4 ; SSE2-NEXT: .LBB3_3: # %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE2-NEXT: movd %xmm0, 4(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: movd %xmm1, 4(%rdi) ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB3_6 ; SSE2-NEXT: .LBB3_5: # %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: movd %xmm0, 8(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: movd %xmm1, 8(%rdi) ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB3_8 ; SSE2-NEXT: .LBB3_7: # %cond.store5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: movd %xmm0, 12(%rdi) ; SSE2-NEXT: retq ; @@ -1492,37 +1479,34 @@ ; SSE2-LABEL: truncstore_v4i64_v4i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; 
SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -1775,45 +1759,43 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i64_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packuswb %xmm3, %xmm4 -; SSE2-NEXT: packuswb %xmm4, %xmm4 -; SSE2-NEXT: packuswb %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm10 -; SSE2-NEXT: movmskps %xmm10, %ecx +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %ecx ; SSE2-NEXT: xorl $15, %ecx ; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: movd %xmm4, %eax +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: jne .LBB5_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %cl @@ -2077,9 +2059,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm4 ; 
SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -2215,9 +2196,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] @@ -2364,19 +2344,18 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: packuswb %xmm4, %xmm4 -; SSE2-NEXT: packuswb %xmm4, %xmm4 -; SSE2-NEXT: packuswb %xmm4, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax ; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm4, %ecx +; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB8_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -4735,9 +4714,8 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 @@ -4961,16 +4939,15 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: packuswb %xmm4, %xmm4 -; SSE2-NEXT: packuswb %xmm4, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx ; SSE2-NEXT: xorl $15, %ecx ; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: movd %xmm4, %eax +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: jne .LBB14_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %cl diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -755,20 +755,17 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; 
SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] ; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSSE3-NEXT: psubusw %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: psubusw %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test13: @@ -1021,20 +1018,17 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] ; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSSE3-NEXT: psubusw %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: psubusw %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test15: @@ -1544,20 +1538,17 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] ; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSSE3-NEXT: psubusw %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: psubusw %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: psubus_8i32_max: @@ -1620,11 +1611,9 @@ ; SSE2OR3-NEXT: pand %xmm5, %xmm7 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2OR3-NEXT: por %xmm7, %xmm5 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535] ; SSE2OR3-NEXT: pand %xmm5, %xmm2 -; SSE2OR3-NEXT: pandn %xmm10, %xmm5 -; SSE2OR3-NEXT: por %xmm2, %xmm5 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] 
+; SSE2OR3-NEXT: por %xmm5, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; SSE2OR3-NEXT: movdqa %xmm1, %xmm5 ; SSE2OR3-NEXT: pxor %xmm8, %xmm5 @@ -1637,9 +1626,8 @@ ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] ; SSE2OR3-NEXT: por %xmm5, %xmm6 ; SSE2OR3-NEXT: pand %xmm6, %xmm1 -; SSE2OR3-NEXT: pandn %xmm10, %xmm6 -; SSE2OR3-NEXT: por %xmm1, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] +; SSE2OR3-NEXT: por %xmm6, %xmm1 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2OR3-NEXT: movdqa %xmm4, %xmm2 @@ -1653,9 +1641,8 @@ ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2OR3-NEXT: por %xmm2, %xmm5 ; SSE2OR3-NEXT: pand %xmm5, %xmm4 -; SSE2OR3-NEXT: pandn %xmm10, %xmm5 -; SSE2OR3-NEXT: por %xmm4, %xmm5 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] +; SSE2OR3-NEXT: por %xmm5, %xmm4 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] ; SSE2OR3-NEXT: movdqa %xmm3, %xmm4 ; SSE2OR3-NEXT: pxor %xmm8, %xmm4 @@ -1667,9 +1654,8 @@ ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] ; SSE2OR3-NEXT: por %xmm4, %xmm5 ; SSE2OR3-NEXT: pand %xmm5, %xmm3 -; SSE2OR3-NEXT: pandn %xmm10, %xmm5 -; SSE2OR3-NEXT: por %xmm3, %xmm5 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE2OR3-NEXT: por %xmm5, %xmm3 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2OR3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] @@ -1924,20 +1910,17 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] ; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSSE3-NEXT: psubusw %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: psubusw %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: psubus_i16_i32_max_swapped: @@ -2019,20 +2002,17 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] ; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtd 
%xmm3, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSSE3-NEXT: psubusw %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: psubusw %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: psubus_i16_i32_min: @@ -2608,20 +2588,17 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] ; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSSE3-NEXT: psubusw %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: psubusw %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test32: diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -310,53 +310,51 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-LABEL: vec: ; X64: # %bb.0: -; X64-NEXT: pxor %xmm9, %xmm9 -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; X64-NEXT: movq %xmm2, %rcx -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; X64-NEXT: paddq %xmm2, %xmm2 -; X64-NEXT: psllq $31, %xmm2 -; X64-NEXT: movq %xmm2, %rax +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: movdqa %xmm1, %xmm3 +; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: movq %xmm3, %rcx +; X64-NEXT: movdqa %xmm0, %xmm4 +; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; X64-NEXT: paddq %xmm4, %xmm4 +; X64-NEXT: psllq $31, %xmm4 +; X64-NEXT: movq %xmm4, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm6 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; X64-NEXT: movq %xmm2, %rax -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: movq %xmm2, %rcx +; X64-NEXT: movq %rax, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; X64-NEXT: movq %xmm4, %rax +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: movq %xmm4, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; X64-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; X64-NEXT: movdqa %xmm6, %xmm2 -; X64-NEXT: pxor %xmm10, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X64-NEXT: movq %rax, 
%xmm4 +; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; X64-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; X64-NEXT: movdqa %xmm3, %xmm7 +; X64-NEXT: pxor %xmm9, %xmm7 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] ; X64-NEXT: movdqa {{.*#+}} xmm8 = [2147483649,2147483649,2147483649,2147483649] -; X64-NEXT: pcmpeqd %xmm8, %xmm7 +; X64-NEXT: pcmpeqd %xmm8, %xmm6 ; X64-NEXT: movdqa {{.*#+}} xmm4 = [9223372043297226751,9223372043297226751] ; X64-NEXT: movdqa %xmm4, %xmm5 -; X64-NEXT: pcmpgtd %xmm2, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; X64-NEXT: pand %xmm7, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; X64-NEXT: por %xmm3, %xmm2 -; X64-NEXT: movdqa {{.*#+}} xmm7 = [8589934591,8589934591] -; X64-NEXT: pand %xmm2, %xmm6 -; X64-NEXT: pandn %xmm7, %xmm2 -; X64-NEXT: por %xmm6, %xmm2 -; X64-NEXT: psrlq $1, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; X64-NEXT: pcmpgtd %xmm7, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; X64-NEXT: pand %xmm6, %xmm7 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X64-NEXT: por %xmm7, %xmm5 +; X64-NEXT: pand %xmm5, %xmm3 +; X64-NEXT: por %xmm5, %xmm3 +; X64-NEXT: psrlq $1, %xmm3 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: paddq %xmm0, %xmm0 ; X64-NEXT: psllq $31, %xmm0 ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm3 +; X64-NEXT: movq %rax, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: psrlq $32, %xmm1 @@ -364,20 +362,20 @@ ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm0 -; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; X64-NEXT: pxor %xmm3, %xmm10 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; X64-NEXT: pxor %xmm2, %xmm9 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] ; X64-NEXT: pcmpeqd %xmm8, %xmm0 -; X64-NEXT: pcmpgtd %xmm10, %xmm4 +; X64-NEXT: pcmpgtd %xmm9, %xmm4 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] ; X64-NEXT: pand %xmm0, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: pand %xmm0, %xmm3 -; X64-NEXT: pandn %xmm7, %xmm0 -; X64-NEXT: por %xmm3, %xmm0 -; X64-NEXT: psrlq $1, %xmm0 -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: por %xmm0, %xmm2 +; X64-NEXT: psrlq $1, %xmm2 +; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; X64-NEXT: movaps %xmm2, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: vec: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -883,121 +883,117 @@ ; ; AVX1-LABEL: vf16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,1,0,1] -; AVX1-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm13 -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm4[0,1],xmm1[2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-NEXT: vandnps %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovdqa (%rdi), %xmm15 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; AVX1-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,1,1,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; AVX1-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm0[2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX1-NEXT: vmovdqa (%rdi), %xmm14 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm15 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,0,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vandps %ymm1, %ymm11, %ymm1 -; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovdqa 144(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm8 -; AVX1-NEXT: vpsllq $48, %xmm9, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1],xmm10[2,3],xmm13[4,5,6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-NEXT: vandps %ymm1, %ymm12, %ymm3 +; AVX1-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm11 = xmm1[0,1,0,1] +; AVX1-NEXT: vandnps %ymm11, %ymm12, %ymm11 +; AVX1-NEXT: vorps %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm10 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vandnps %ymm2, %ymm11, %ymm2 -; AVX1-NEXT: vpsrlq $48, %xmm5, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,3,2,3] 
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm0[4,5],xmm7[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm3 +; AVX1-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-NEXT: vpsrlq $48, %xmm15, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,1,10,11,4,5,14,15,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7] -; AVX1-NEXT: vandps %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vorps %ymm2, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm11 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1,2,3],xmm10[4,5],xmm13[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] +; AVX1-NEXT: vandps %ymm2, %ymm12, %ymm2 +; AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm11 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm8[4,5],xmm9[6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,1,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,1,3] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,2,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,2,0] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm12 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm13[2,3],xmm10[4,5],xmm13[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7] ; 
AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u] -; AVX1-NEXT: vpsrlq $48, %xmm14, %xmm3 +; AVX1-NEXT: vpsrlq $48, %xmm13, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm7[4,5],xmm0[6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm15[2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm14[2,3],xmm15[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,3] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[3],xmm1[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm15[4,5],xmm5[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3],xmm3[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2,3],xmm14[4,5],xmm15[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps %ymm8, (%rsi) +; AVX1-NEXT: vmovaps %ymm10, (%rsi) ; AVX1-NEXT: vmovaps %ymm11, (%rdx) ; AVX1-NEXT: vmovaps %ymm12, (%rcx) ; AVX1-NEXT: vmovaps %ymm2, (%r8) @@ -1815,306 +1811,300 @@ ; AVX1-LABEL: vf32: ; AVX1: # %bb.0: ; AVX1-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX1-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa 256(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; AVX1-NEXT: vmovdqa 272(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-NEXT: vandnps %ymm0, %ymm8, %ymm0 -; AVX1-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX1-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6,7] -; AVX1-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm2, %xmm13 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX1-NEXT: vmovdqa 272(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX1-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] +; AVX1-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] +; AVX1-NEXT: vmovdqa 176(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; AVX1-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vandps %ymm1, %ymm8, %ymm1 -; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovdqa 304(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-NEXT: vandnps %ymm4, %ymm12, %ymm4 +; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,1,1,3] -; AVX1-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm9 -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] -; AVX1-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-NEXT: vmovdqa %xmm2, %xmm12 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT: vmovdqa %xmm0, %xmm7 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; AVX1-NEXT: vmovdqa %xmm0, %xmm4 -; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] -; AVX1-NEXT: vmovdqa %xmm0, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1,2,3],xmm6[4],xmm2[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm13, %xmm15, %xmm15 -; AVX1-NEXT: vandnps %ymm3, %ymm8, %ymm3 -; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0 -; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm15[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm11[2,3],xmm9[4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX1-NEXT: vpsllq $48, %xmm12, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlq $48, %xmm7, %xmm7 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] -; AVX1-NEXT: vmovdqa %xmm6, %xmm15 -; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa %xmm5, %xmm10 -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] -; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm7[3,4,5,6,7] -; AVX1-NEXT: vandnps %ymm0, %ymm8, %ymm0 -; AVX1-NEXT: vandps %ymm3, %ymm8, %ymm3 -; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3],xmm9[4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; AVX1-NEXT: vmovdqa %xmm0, %xmm14 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; AVX1-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; AVX1-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; AVX1-NEXT: vmovdqa %xmm2, %xmm10 +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,1,2,3,4,5,6,7] +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; AVX1-NEXT: vmovdqa %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,1,0,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3],xmm2[4],xmm15[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vandps %ymm4, %ymm12, %ymm7 +; AVX1-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[0,1,0,1] +; AVX1-NEXT: vmovaps %xmm1, %xmm4 +; AVX1-NEXT: vandnps %ymm15, %ymm12, %ymm15 +; AVX1-NEXT: vorps %ymm7, %ymm15, %ymm7 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm14, %xmm15 +; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm14[2,3],xmm8[4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpsllq $48, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: # xmm3 = mem[0,1],xmm13[2,3],mem[4,5],xmm13[6,7] -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,3,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3,4,5,6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm3[4,5],xmm9[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm7 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm14 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm6[2,3],xmm2[4,5],xmm6[6,7] +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3,4,5,6,7] +; AVX1-NEXT: vandps %ymm7, %ymm12, %ymm7 +; AVX1-NEXT: vpsllq $48, %xmm4, %xmm14 +; AVX1-NEXT: vmovdqa %xmm4, %xmm8 +; AVX1-NEXT: vmovaps %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vandnps %ymm14, %ymm12, %ymm14 +; AVX1-NEXT: vorps %ymm7, %ymm14, %ymm7 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovdqa %xmm13, %xmm14 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm13[4,5],xmm5[6,7] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm12, %xmm6 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4,5,6,7] -; AVX1-NEXT: vandnps %ymm0, %ymm8, %ymm0 -; AVX1-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1],xmm12[2,3],xmm13[4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = xmm1[0,1],mem[2,3],xmm1[4,5],mem[6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm7 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5,6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-NEXT: vandps %ymm7, %ymm1, %ymm1 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vpsllq $48, %xmm4, %xmm4 +; AVX1-NEXT: vandnps %ymm4, %ymm7, %ymm4 +; AVX1-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = xmm9[0,1,2,3],mem[4,5],xmm9[6,7] +; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[0,1,2,3],xmm15[4,5],mem[6,7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = ; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: # xmm5 = mem[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; AVX1-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm10[2,3],xmm15[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpshufd $212, (%rsp), %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: # xmm3 = mem[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX1-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: # xmm5 = mem[0,1,2,0] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = mem[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw $12, (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm6[2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: # xmm1 = mem[0,1,1,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; AVX1-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,2,0] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm4[2,3],xmm8[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm8, %xmm10 -; AVX1-NEXT: vmovdqa %xmm4, %xmm9 -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm4[4,5],xmm6[6,7] -; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa %xmm14, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX1-NEXT: vmovdqa %xmm13, %xmm8 +; AVX1-NEXT: vmovdqa %xmm5, %xmm15 +; AVX1-NEXT: vmovdqa %xmm14, %xmm8 +; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm5[2,3],xmm14[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm14 +; AVX1-NEXT: vmovdqa %xmm12, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm12[4,5],xmm13[6,7] +; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm11, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4],xmm14[5,6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm13[2,3],xmm7[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3,4,5],xmm5[6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[0,1,2,0] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 -; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlq $48, %xmm2, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm9[4,5],xmm10[6,7] -; AVX1-NEXT: vmovdqa %xmm10, %xmm13 -; AVX1-NEXT: vmovdqa %xmm9, %xmm14 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1],xmm11[2,3],xmm12[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm11[2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,1,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,2,0] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 +; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm5[0,1],xmm13[2,3],xmm5[4,5],xmm13[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm15[4,5],xmm8[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm10, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm11[4,5],xmm7[6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm12[2,3],xmm6[4,5],xmm12[6,7] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm5, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] +; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vpsrlq $48, %xmm12, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm0[2,3],xmm8[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1,2,3],xmm13[4,5],xmm11[6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,7,4,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,1,0,3] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3],xmm2[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm7[4,5],xmm0[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,2,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3],xmm4[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[2,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: # xmm3 = mem[0,3,2,3] -; AVX1-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: # xmm3 = xmm3[0,1,2],mem[3],xmm3[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4,5],xmm4[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] +; 
AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = mem[0,3,2,3] +; AVX1-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = xmm4[0,1,2],mem[3],xmm4[4,5,6,7] ; AVX1-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-NEXT: # xmm5 = mem[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: # xmm3 = mem[0,1,2,3],xmm0[4,5],mem[6,7] -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = xmm6[0,1,2,3],mem[4,5],xmm6[6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-NEXT: # xmm4 = mem[1,1,1,1] ; AVX1-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -2122,7 +2112,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,1,1,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 @@ -2132,15 +2122,15 @@ ; AVX1-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm3, (%rdx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-NEXT: vmovaps %ymm8, (%r8) +; AVX1-NEXT: vmovaps %ymm9, (%r8) ; AVX1-NEXT: vmovaps %ymm2, 32(%r9) ; AVX1-NEXT: vmovaps %ymm1, (%r9) ; AVX1-NEXT: addq $424, %rsp # imm = 0x1A8 diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -33,9 +33,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand 
%xmm2, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v2i64_v2i32: @@ -51,9 +50,8 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v2i64_v2i32: @@ -133,9 +131,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: retq ; @@ -152,9 +149,8 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: movq %xmm0, (%rdi) ; SSSE3-NEXT: retq ; @@ -252,7 +248,6 @@ ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSE2-NEXT: retq @@ -284,7 +279,6 @@ ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSSE3-NEXT: por %xmm4, %xmm0 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSSE3-NEXT: retq @@ -433,130 +427,120 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64>* %p0) { ; SSE2-LABEL: trunc_usat_v8i64_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm9 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 -; SSE2-NEXT: movdqa 32(%rdi), %xmm6 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm6 -; 
SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm9, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v8i64_v8i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm9 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm5 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm6 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = 
[9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSSE3-NEXT: pxor %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: movdqa %xmm6, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: movdqa (%rdi), %xmm0 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm2 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm1 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm1 ; SSSE3-NEXT: por %xmm6, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSSE3-NEXT: movdqa %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] ; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; SSSE3-NEXT: pand %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pand %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pshufd 
{{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm9, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i32: @@ -731,9 +715,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: retq ; @@ -750,9 +733,8 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSSE3-NEXT: retq ; @@ -854,9 +836,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq @@ -874,9 +855,8 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSSE3-NEXT: movd %xmm0, (%rdi) ; SSSE3-NEXT: retq @@ -975,74 +955,68 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; SSE2-LABEL: trunc_usat_v4i64_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v4i64_v4i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: 
pxor %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq @@ -1148,78 +1122,72 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, <4 x i16> *%p1) { ; SSE2-LABEL: trunc_usat_v4i64_v4i16_store: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movq %xmm1, (%rdi) 
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movq %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v4i64_v4i16_store: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movq %xmm1, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movq %xmm0, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v4i64_v4i16_store: @@ -1328,148 +1296,138 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64>* %p0) { ; SSE2-LABEL: trunc_usat_v8i64_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm6 -; SSE2-NEXT: movdqa 48(%rdi), %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa 
%xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm3 +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: por %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE2-NEXT: retq ; -; SSSE3-LABEL: trunc_usat_v8i64_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm4 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm9 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm6 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002324991,9223372039002324991] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-LABEL: trunc_usat_v8i64_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa (%rdi), %xmm2 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm3 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm0 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; 
SSSE3-NEXT: pand %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: por %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSSE3-NEXT: pand %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm2 ; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm7, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm7, %xmm6 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pand %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i16: @@ -1602,9 +1560,8 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: retq @@ -1616,7 +1573,6 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSSE3-NEXT: retq @@ -1684,9 +1640,8 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: movq %xmm0, (%rdi) @@ -1699,10 +1654,9 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movq %xmm2, (%rdi) +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movq %xmm0, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v4i32_v4i16_store: @@ -1791,25 +1745,22 @@ ; ; SSSE3-LABEL: trunc_usat_v8i32_v8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm6 -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm4, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm1, %xmm5 -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; 
SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i32_v8i16: @@ -2044,7 +1995,6 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 @@ -2065,7 +2015,6 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: retq @@ -2146,13 +2095,12 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rdi) ; SSE2-NEXT: retq ; @@ -2169,10 +2117,9 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, (%rdi) ; SSSE3-NEXT: retq ; @@ -2245,75 +2192,70 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; SSE2-LABEL: trunc_usat_v4i64_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE2-NEXT: 
pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v4i64_v4i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pand %xmm4, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm1, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; 
SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v4i64_v4i8: @@ -2422,77 +2364,72 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, <4 x i8> *%p1) { ; SSE2-LABEL: trunc_usat_v4i64_v4i8_store: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0] ; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: packuswb %xmm2, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: movd %xmm3, (%rdi) +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v4i64_v4i8_store: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, 
%xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm0, %xmm2 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: movd %xmm3, (%rdi) +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: movd %xmm0, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v4i64_v4i8_store: @@ -3595,7 +3532,6 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 @@ -3609,7 +3545,6 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: retq @@ -3678,12 +3613,11 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: movd %xmm2, (%rdi) +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: 
trunc_usat_v4i32_v4i8_store: @@ -3693,10 +3627,9 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movd %xmm2, (%rdi) +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %xmm0, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v4i32_v4i8_store: