Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -586,14 +586,15 @@
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
-  EVT SVT = getSetCCResultType(N->getOperand(0).getValueType());
+  EVT InVT = N->getOperand(0).getValueType();
 
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  // If we need to promote the operands, use their promoted type for the
+  // getSetCCResultType query. Otherwise, we might get back a type that
+  // still needs to be promoted.
+  if (getTypeAction(InVT) == TargetLowering::TypePromoteInteger)
+    InVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
 
-  // Only use the result of getSetCCResultType if it is legal,
-  // otherwise just use the promoted result type (NVT).
-  if (!TLI.isTypeLegal(SVT))
-    SVT = NVT;
+  EVT SVT = getSetCCResultType(InVT);
 
   SDLoc dl(N);
   assert(SVT.isVector() == N->getOperand(0).getValueType().isVector() &&
@@ -604,6 +605,7 @@
                               N->getOperand(1), N->getOperand(2));
 
   // Convert to the expected type.
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   return DAG.getSExtOrTrunc(SetCC, dl, NVT);
 }
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -17943,24 +17943,6 @@
   if (VTOp0 == MVT::v2i32)
     return SDValue();
 
-  if (VT.is128BitVector() && VTOp0.is256BitVector()) {
-    // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
-    // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
-    // legalizer firstly checks if the first operand in input to the setcc has
-    // a legal type. If so, then it promotes the return type to that same type.
-    // Otherwise, the return type is promoted to the 'next legal type' which,
-    // for a vector of MVT::i1 is always a 128-bit integer vector type.
-    //
-    // We reach this code only if the following two conditions are met:
-    // 1. Both return type and operand type have been promoted to wider types
-    //    by the type legalizer.
-    // 2. The original operand type has been promoted to a 256-bit vector.
-    //
-    // Note that condition 2. only applies for AVX targets.
-    SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
-    return DAG.getZExtOrTrunc(NewOp, dl, VT);
-  }
-
   // The non-AVX512 code below works under the assumption that source and
   // destination types are the same.
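(Illustrative note, not part of the patch.) A minimal LLVM IR sketch of the situation the new PromoteIntRes_SETCC code is aimed at: both the i1 result vector and the integer operand vector of a setcc are illegal, so the integer type legalizer promotes the operands, and getSetCCResultType should be queried with the promoted operand type rather than the original one. The function name and the concrete types below are hypothetical and chosen only for illustration:

; Hypothetical example, not taken from this patch or its tests.
define <8 x i1> @promoted_setcc(<8 x i8> %a, <8 x i8> %b) {
  ; On targets where <8 x i8> and <8 x i1> are not legal, both the operands
  ; and the i1 result are promoted during type legalization, which is the
  ; case the PromoteIntRes_SETCC change above addresses.
  %cmp = icmp ugt <8 x i8> %a, %b
  ret <8 x i1> %cmp
}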
assert((Subtarget.hasAVX512() || (VT == VTOp0)) && Index: test/CodeGen/ARM/vuzp.ll =================================================================== --- test/CodeGen/ARM/vuzp.ll +++ test/CodeGen/ARM/vuzp.ll @@ -465,44 +465,40 @@ ; CHECK-NEXT: add r11, sp, #8 ; CHECK-NEXT: bic sp, sp, #15 ; CHECK-NEXT: add r12, r11, #32 -; CHECK-NEXT: add lr, r11, #60 +; CHECK-NEXT: add lr, r11, #44 ; CHECK-NEXT: vld1.32 {d17[0]}, [r12:32] ; CHECK-NEXT: add r12, r11, #24 -; CHECK-NEXT: vld1.32 {d22[0]}, [lr:32] -; CHECK-NEXT: add lr, r11, #36 ; CHECK-NEXT: vld1.32 {d16[0]}, [r12:32] ; CHECK-NEXT: add r12, r11, #52 ; CHECK-NEXT: vld1.32 {d19[0]}, [r12:32] -; CHECK-NEXT: add r12, r11, #44 -; CHECK-NEXT: vld1.32 {d17[1]}, [lr:32] -; CHECK-NEXT: vld1.32 {d18[0]}, [r12:32] -; CHECK-NEXT: add r12, r11, #40 -; CHECK-NEXT: vld1.32 {d20[0]}, [r12:32] ; CHECK-NEXT: ldr r12, [r11, #64] -; CHECK-NEXT: vcgt.u32 q10, q11, q10 +; CHECK-NEXT: vld1.32 {d18[0]}, [lr:32] +; CHECK-NEXT: add lr, r11, #40 +; CHECK-NEXT: vld1.32 {d20[0]}, [lr:32] ; CHECK-NEXT: ldr r4, [r12] -; CHECK-NEXT: vmov.32 d25[0], r4 +; CHECK-NEXT: vmov.32 d23[0], r4 +; CHECK-NEXT: add r4, r11, #60 +; CHECK-NEXT: vld1.32 {d24[0]}, [r4:32] +; CHECK-NEXT: add r4, r11, #36 +; CHECK-NEXT: vld1.32 {d17[1]}, [r4:32] ; CHECK-NEXT: add r4, r11, #28 +; CHECK-NEXT: vcgt.u32 q10, q12, q10 +; CHECK-NEXT: vmov.u8 lr, d23[3] ; CHECK-NEXT: vld1.32 {d16[1]}, [r4:32] ; CHECK-NEXT: add r4, r11, #56 ; CHECK-NEXT: vld1.32 {d19[1]}, [r4:32] ; CHECK-NEXT: add r4, r11, #48 -; CHECK-NEXT: vmov.u8 lr, d25[3] ; CHECK-NEXT: vld1.32 {d18[1]}, [r4:32] ; CHECK-NEXT: add r4, r12, #4 ; CHECK-NEXT: vcgt.u32 q8, q9, q8 ; CHECK-NEXT: vmovn.i32 d19, q10 ; CHECK-NEXT: vldr d20, .LCPI23_0 -; CHECK-NEXT: vmov.i8 d18, #0x7 -; CHECK-NEXT: vmovn.i32 d16, q8 -; CHECK-NEXT: vneg.s8 d17, d18 -; CHECK-NEXT: vuzp.8 d16, d19 +; CHECK-NEXT: vmovn.i32 d18, q8 +; CHECK-NEXT: vmovn.i16 d22, q9 ; CHECK-NEXT: vmov.i8 q9, #0x7 -; CHECK-NEXT: vshl.i8 d16, d16, #7 -; CHECK-NEXT: vneg.s8 q9, q9 -; CHECK-NEXT: vshl.s8 d24, d16, d17 ; CHECK-NEXT: vmov.8 d17[0], lr -; CHECK-NEXT: vtbl.8 d16, {d24, d25}, d20 +; CHECK-NEXT: vneg.s8 q9, q9 +; CHECK-NEXT: vtbl.8 d16, {d22, d23}, d20 ; CHECK-NEXT: vld1.8 {d17[1]}, [r4] ; CHECK-NEXT: add r4, r11, #8 ; CHECK-NEXT: vshl.i8 q8, q8, #7 Index: test/CodeGen/X86/bitcast-and-setcc-512.ll =================================================================== --- test/CodeGen/X86/bitcast-and-setcc-512.ll +++ test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -13,22 +13,38 @@ ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: pcmpgtq %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pcmpgtq %xmm6, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pcmpgtq %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: pcmpgtq 
{{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] ; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm11[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[0,2] -; SSE-NEXT: packssdw %xmm10, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: packsswb %xmm0, %xmm8 -; SSE-NEXT: pmovmskb %xmm8, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: packsswb %xmm0, %xmm3 +; SSE-NEXT: pmovmskb %xmm3, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; @@ -38,30 +54,27 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 ; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm9 +; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; AVX1-NEXT: vpackssdw %xmm8, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm2 ; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -73,23 +86,20 @@ ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %ymm7, %ymm5, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %ymm6, %ymm4, %ymm2 -; 
AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -128,22 +138,38 @@ ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: cmpltpd %xmm3, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,1,0,2,4,5,6,7] ; SSE-NEXT: cmpltpd %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: cmpltpd %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: cmpltpd %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] -; SSE-NEXT: packssdw %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,0,2,4,5,6,7] ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm11[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[0,2] -; SSE-NEXT: packssdw %xmm10, %xmm8 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: packsswb %xmm0, %xmm8 -; SSE-NEXT: pmovmskb %xmm8, %eax +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: packsswb %xmm0, %xmm3 +; SSE-NEXT: pmovmskb %xmm3, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; @@ -152,23 +178,20 @@ ; AVX12-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1 ; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX12-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 -; AVX12-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX12-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0 ; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX12-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX12-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX12-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vcmpltpd %ymm5, %ymm7, %ymm1 ; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX12-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX12-NEXT: vpshufb 
%xmm3, %xmm1, %xmm1 ; AVX12-NEXT: vcmpltpd %ymm4, %ymm6, %ymm2 -; AVX12-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX12-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 -; AVX12-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX12-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX12-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX12-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX12-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX12-NEXT: vpmovmskb %xmm0, %eax ; AVX12-NEXT: # kill: def $al killed $al killed $eax @@ -317,23 +340,33 @@ ; SSE-LABEL: v16i32: ; SSE: # %bb.0: ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = +; SSE-NEXT: pshufb %xmm7, %xmm3 ; SSE-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pshufb %xmm7, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE-NEXT: pshufb %xmm3, %xmm1 ; SSE-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pshufb %xmm3, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: packssdw %xmm11, %xmm10 +; SSE-NEXT: pshufb %xmm7, %xmm11 ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufb %xmm7, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: pshufb %xmm3, %xmm10 ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: packssdw %xmm9, %xmm8 -; SSE-NEXT: packsswb %xmm10, %xmm8 +; SSE-NEXT: pshufb %xmm3, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm8 ; SSE-NEXT: pmovmskb %xmm8, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax @@ -419,23 +452,33 @@ ; SSE-LABEL: v16f32: ; SSE: # %bb.0: ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: cmpltps %xmm3, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = +; SSE-NEXT: pshufb %xmm3, %xmm7 ; SSE-NEXT: cmpltps %xmm2, %xmm6 -; SSE-NEXT: packssdw %xmm7, %xmm6 +; SSE-NEXT: pshufb %xmm3, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; SSE-NEXT: cmpltps %xmm1, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE-NEXT: pshufb %xmm1, %xmm5 ; SSE-NEXT: cmpltps %xmm0, %xmm4 -; SSE-NEXT: packssdw %xmm5, %xmm4 -; SSE-NEXT: packsswb %xmm6, %xmm4 +; SSE-NEXT: pshufb %xmm1, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] ; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: packssdw %xmm11, %xmm10 +; SSE-NEXT: pshufb %xmm3, %xmm11 ; SSE-NEXT: cmpltps 
{{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufb %xmm3, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: pshufb %xmm1, %xmm10 ; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: packssdw %xmm9, %xmm8 -; SSE-NEXT: packsswb %xmm10, %xmm8 +; SSE-NEXT: pshufb %xmm1, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm8 ; SSE-NEXT: pmovmskb %xmm8, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax Index: test/CodeGen/X86/vector-compare-results.ll =================================================================== --- test/CodeGen/X86/vector-compare-results.ll +++ test/CodeGen/X86/vector-compare-results.ll @@ -2403,19 +2403,28 @@ ; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 ; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 ; SSE2-NEXT: cmpltps %xmm3, %xmm15 +; SSE2-NEXT: movaps {{.*#+}} xmm3 = [255,255,255,255] +; SSE2-NEXT: andps %xmm3, %xmm15 ; SSE2-NEXT: cmpltps %xmm2, %xmm14 -; SSE2-NEXT: packssdw %xmm15, %xmm14 +; SSE2-NEXT: andps %xmm3, %xmm14 +; SSE2-NEXT: packuswb %xmm15, %xmm14 ; SSE2-NEXT: cmpltps %xmm1, %xmm13 +; SSE2-NEXT: andps %xmm3, %xmm13 ; SSE2-NEXT: cmpltps %xmm0, %xmm8 -; SSE2-NEXT: packssdw %xmm13, %xmm8 -; SSE2-NEXT: packsswb %xmm14, %xmm8 +; SSE2-NEXT: andps %xmm3, %xmm8 +; SSE2-NEXT: packuswb %xmm13, %xmm8 +; SSE2-NEXT: packuswb %xmm14, %xmm8 ; SSE2-NEXT: cmpltps %xmm7, %xmm12 +; SSE2-NEXT: andps %xmm3, %xmm12 ; SSE2-NEXT: cmpltps %xmm6, %xmm10 -; SSE2-NEXT: packssdw %xmm12, %xmm10 +; SSE2-NEXT: andps %xmm3, %xmm10 +; SSE2-NEXT: packuswb %xmm12, %xmm10 ; SSE2-NEXT: cmpltps %xmm5, %xmm11 +; SSE2-NEXT: andps %xmm3, %xmm11 ; SSE2-NEXT: cmpltps %xmm4, %xmm9 -; SSE2-NEXT: packssdw %xmm11, %xmm9 -; SSE2-NEXT: packsswb %xmm10, %xmm9 +; SSE2-NEXT: andps %xmm3, %xmm9 +; SSE2-NEXT: packuswb %xmm11, %xmm9 +; SSE2-NEXT: packuswb %xmm10, %xmm9 ; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $1, %eax @@ -2946,19 +2955,28 @@ ; SSE2-LABEL: test_cmp_v32i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSE2-NEXT: pand %xmm8, %xmm3 ; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 ; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm1 ; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: packsswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm6 -; SSE2-NEXT: packssdw %xmm7, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: packuswb %xmm7, %xmm6 ; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 ; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: packssdw %xmm5, %xmm4 -; SSE2-NEXT: packsswb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: packuswb %xmm5, %xmm4 +; SSE2-NEXT: packuswb %xmm6, %xmm4 ; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $1, %eax @@ -6464,52 +6482,89 @@ ; SSE2-LABEL: test_cmp_v32f64: ; SSE2: # %bb.0: ; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 -; SSE2-NEXT: cmpltpd %xmm7, %xmm8 -; 
SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: cmpltpd %xmm6, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm8[0,2] -; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE2-NEXT: cmpltpd %xmm5, %xmm6 -; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: cmpltpd %xmm4, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSE2-NEXT: packssdw %xmm7, %xmm5 -; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: cmpltpd %xmm3, %xmm4 -; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: cmpltpd %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: cmpltpd %xmm1, %xmm2 +; SSE2-NEXT: cmpltpd %xmm1, %xmm8 ; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: cmpltpd %xmm0, %xmm1 +; SSE2-NEXT: movapd {{.*#+}} xmm9 = [255,255] +; SSE2-NEXT: andpd %xmm9, %xmm8 +; SSE2-NEXT: andpd %xmm9, %xmm1 +; SSE2-NEXT: packuswb %xmm8, %xmm1 ; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: cmpltpd %xmm3, %xmm0 ; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: packsswb %xmm5, %xmm1 -; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: cmpltpd %xmm2, %xmm3 +; SSE2-NEXT: andpd %xmm9, %xmm0 +; SSE2-NEXT: andpd %xmm9, %xmm3 +; SSE2-NEXT: packuswb %xmm0, %xmm3 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: cmpltpd %xmm7, %xmm0 ; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2] +; SSE2-NEXT: cmpltpd %xmm6, %xmm3 +; SSE2-NEXT: andpd %xmm9, %xmm0 +; SSE2-NEXT: andpd %xmm9, %xmm3 +; SSE2-NEXT: packuswb %xmm0, %xmm3 +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: cmpltpd %xmm5, %xmm0 +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: cmpltpd %xmm4, %xmm6 +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: andpd %xmm9, %xmm0 +; SSE2-NEXT: andpd %xmm9, %xmm6 +; SSE2-NEXT: packuswb %xmm0, %xmm6 +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: packuswb %xmm3, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm6 +; SSE2-NEXT: packuswb %xmm6, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,2,2] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: andpd %xmm9, %xmm0 ; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: andpd %xmm9, %xmm3 +; SSE2-NEXT: packuswb %xmm0, %xmm3 +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE2-NEXT: cmpltpd 
{{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; SSE2-NEXT: packssdw %xmm4, %xmm0 -; SSE2-NEXT: packsswb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: andpd %xmm9, %xmm0 +; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: andpd %xmm9, %xmm5 +; SSE2-NEXT: packuswb %xmm0, %xmm5 +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm3 +; SSE2-NEXT: packuswb %xmm5, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: andpd %xmm9, %xmm0 +; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: andpd %xmm9, %xmm1 +; SSE2-NEXT: packuswb %xmm0, %xmm1 +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: andpd %xmm9, %xmm0 +; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: andpd %xmm9, %xmm2 +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,2] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE2-NEXT: movapd %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx @@ -6569,7 +6624,7 @@ ; SSE2-NEXT: orl %ecx, %edx ; SSE2-NEXT: orl %eax, %edx ; SSE2-NEXT: movw %dx, 2(%rdi) -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movapd %xmm4, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx @@ -6944,233 +6999,241 @@ ; SSE2-LABEL: test_cmp_v32i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0] -; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 ; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm10 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm9[0,2] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,2,2,3] -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm9, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255] +; SSE2-NEXT: pand %xmm10, %xmm9 ; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: pand %xmm9, %xmm0 -; 
SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm10, %xmm11 +; SSE2-NEXT: packuswb %xmm9, %xmm11 +; SSE2-NEXT: packuswb %xmm11, %xmm11 +; SSE2-NEXT: packuswb %xmm11, %xmm11 +; SSE2-NEXT: pxor %xmm8, %xmm3 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm10, %xmm2 +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm10, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm10, %xmm3 +; SSE2-NEXT: packuswb %xmm1, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm11[0],xmm3[1] +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm10, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, 
%xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: pand %xmm9, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: packuswb %xmm1, %xmm4 +; SSE2-NEXT: packuswb %xmm4, %xmm4 +; SSE2-NEXT: packuswb %xmm4, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 
+; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pxor {{[0-9]+}}(%rsp), %xmm8 ; SSE2-NEXT: movdqa %xmm8, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[0,2] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: packuswb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: packuswb %xmm4, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,2] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE2-NEXT: movapd %xmm1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx @@ -7230,7 +7293,7 @@ ; SSE2-NEXT: orl %ecx, %edx ; SSE2-NEXT: orl %eax, %edx ; SSE2-NEXT: movw %dx, 2(%rdi) -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movapd %xmm3, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx