Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -28934,12 +28934,92 @@
                      EltNo);
 }
 
+// Try to match patterns such as
+// (i16 bitcast (v16i1 setcc v16i8 v1, v2, gt))
+// ->
+// (movmsk (v16i8 setcc v16i8 v1, v2, gt))
+// before the setcc result is scalarized on subtargets that don't have legal
+// vxi1 types.
+static SDValue combineBitcastOfSetCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
+                                     SDValue SetCC, const X86Subtarget &Subtarget) {
+  if (!VT.isScalarInteger() || SetCC.getValueType().getScalarType() != MVT::i1)
+    return SDValue();
+  // With AVX512 vxi1 types are legal and we prefer using k-regs.
+  if (Subtarget.hasAVX512())
+    return SDValue();
+  SDValue N0 = SetCC->getOperand(0); // First operand that the setcc compares.
+  EVT InVT = N0.getValueType();      // Type of the operands being compared.
+
+  // 128-bit PMOVMSKB requires SSE2; 256-bit PMOVMSKB requires AVX2.
+  // There are cases (see below) where we need to shuffle the result of the
+  // vector compare, and it may be more profitable to bail out if we only have
+  // SSE2. SSSE3 introduces PSHUFB, which makes these shuffles cheaper.
+  switch (InVT.getSimpleVT().SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+  case MVT::v4f32:
+    if (!Subtarget.hasSSE2())
+      return SDValue();
+    break;
+  // For v2i64/v2f64 with only SSE2, don't apply this combine: it is faster to
+  // extract the two elements and do the bitwise manipulation than to shuffle+pmovmskb.
+  case MVT::v2i64:
+  case MVT::v2f64:
+    if (!Subtarget.hasSSSE3())
+      return SDValue();
+    break;
+  case MVT::v4i64:
+  case MVT::v8i32:
+  case MVT::v16i16:
+  case MVT::v32i8:
+  case MVT::v8f32:
+  case MVT::v4f64:
+    if (!Subtarget.hasAVX2())
+      return SDValue();
+    break;
+  }
+
+  SDValue V = DAG.getSetCC(
+      SDLoc(SetCC), InVT.changeVectorElementTypeToInteger(), N0,
+      SetCC->getOperand(1), cast<CondCodeSDNode>(SetCC->getOperand(2))->get());
+
+  // MOVMSK's operand must be a vector of i8. So for types with larger
+  // elements, shuffle one byte from each (all-ones or all-zeros) element to a
+  // consecutive sequence at the start of the vector.
+  // For example, t0 := (v4i32 setcc (v4i32 v1, v2, gt)) needs to be shuffled
+  // as:
+  // (v16i8 shuffle <0,4,8,12,u,u,...,u> (v16i8 bitcast t0), undef)
+  if (InVT != MVT::v16i8 && InVT != MVT::v32i8) {
+    unsigned Stride = InVT.getScalarSizeInBits() / 8;
+    MVT WideMaskVT = InVT.getSizeInBits() == 128 ? MVT::v16i8 : MVT::v32i8;
+    SmallVector<int, 64> Mask(WideMaskVT.getVectorNumElements(), -1);
+    for (unsigned i = 0, e = InVT.getVectorNumElements(); i != e; ++i)
+      Mask[i] = Stride * i;
+    V = DAG.getBitcast(WideMaskVT, V);
+    V = DAG.getVectorShuffle(WideMaskVT, DL, V, DAG.getUNDEF(WideMaskVT), Mask);
+  }
+  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+  return DAG.getZExtOrTrunc(V, DL, VT);
+}
+
 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
-                              const X86Subtarget &Subtarget) {
+                              const X86Subtarget &Subtarget, bool LegalTypes) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
   EVT SrcVT = N0.getValueType();
+  // Try to match patterns such as
+  // (i16 bitcast (v16i1 setcc v16i8 v1, v2, gt))
+  // ->
+  // (movmsk (v16i8 setcc v16i8 v1, v2, gt))
+  // before the setcc result is scalarized on subtargets that don't have legal
+  // vxi1 types.
+ if (!LegalTypes && N0.getOpcode() == ISD::SETCC) + if (SDValue V = combineBitcastOfSetCC(DAG, SDLoc(N), VT, N0, Subtarget)) + return V; // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. @@ -35044,7 +35124,7 @@ case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); - case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget); + case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget, !DCI.isBeforeLegalize()); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); Index: test/CodeGen/X86/bitcast-setcc-128.ll =================================================================== --- test/CodeGen/X86/bitcast-setcc-128.ll +++ test/CodeGen/X86/bitcast-setcc-128.ll @@ -8,91 +8,26 @@ ; SSE2-LABEL: v8i16: ; SSE2: ## BB#0: ; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: pextrw $6, %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: pextrw $5, %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: pextrw $4, %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: pextrw $1, %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: ## kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i16: ; SSSE3: ## BB#0: ; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSSE3-NEXT: pextrw $7, %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pextrw $6, %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pextrw $5, %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pextrw $4, %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pextrw $3, %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pextrw $2, %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pextrw $1, %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSSE3-NEXT: retq ; ; AVX1-LABEL: v8i16: ; AVX1: ## BB#0: ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; 
AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX ; AVX1-NEXT: retq ; ; AVX512-LABEL: v8i16: @@ -110,61 +45,27 @@ ; SSE2-LABEL: v4i32: ; SSE2: ## BB#0: ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: ## kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i32: ; SSSE3: ## BB#0: ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSSE3-NEXT: retq ; ; AVX1-LABEL: v4i32: ; AVX1: ## BB#0: ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX ; AVX1-NEXT: retq ; ; AVX512-LABEL: v4i32: @@ -183,63 +84,27 @@ ; SSE2-LABEL: v4f32: ; SSE2: ## BB#0: ; SSE2-NEXT: cmpltps %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: andl 
$1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: ## kill: %AL %AL %EAX ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4f32: ; SSSE3: ## BB#0: ; SSSE3-NEXT: cmpltps %xmm0, %xmm1 -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm1, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm1, %eax +; SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSSE3-NEXT: retq ; ; AVX1-LABEL: v4f32: ; AVX1: ## BB#0: ; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vextractps $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $0, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX ; AVX1-NEXT: retq ; ; AVX512-LABEL: v4f32: @@ -258,165 +123,22 @@ ; SSE2-LABEL: v16i8: ; SSE2: ## BB#0: ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: 
movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-NEXT: andb $1, %cl -; SSE2-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: andb $1, %al -; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: ## kill: %AX %AX %EAX ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v16i8: ; SSSE3: ## BB#0: ; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSSE3-NEXT: andb $1, %cl -; SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: andb $1, %al -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: ## kill: %AX %AX %EAX ; SSSE3-NEXT: retq ; ; AVX1-LABEL: v16i8: ; AVX1: ## BB#0: ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $15, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $14, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $13, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $12, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: 
vpextrb $11, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $10, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $9, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $8, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $7, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $6, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $5, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $4, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $3, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $2, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $1, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: ## kill: %AX %AX %EAX ; AVX1-NEXT: retq ; ; AVX512-LABEL: v16i8: @@ -467,26 +189,17 @@ ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: movq %xmm1, %rax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm1, %eax +; SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i64: ; AVX1: ## BB#0: ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX ; AVX1-NEXT: retq ; ; AVX512-LABEL: v2i64: @@ -518,26 +231,17 @@ ; SSSE3-LABEL: v2f64: ; SSSE3: ## BB#0: ; SSSE3-NEXT: cmpltpd %xmm0, %xmm1 -; SSSE3-NEXT: movq %xmm1, %rax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm1, %eax +; SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSSE3-NEXT: retq ; ; AVX1-LABEL: v2f64: ; AVX1: ## BB#0: ; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX ; AVX1-NEXT: retq ; ; AVX512-LABEL: v2f64: Index: test/CodeGen/X86/bitcast-setcc-256.ll =================================================================== --- test/CodeGen/X86/bitcast-setcc-256.ll +++ test/CodeGen/X86/bitcast-setcc-256.ll @@ -7,56 +7,12 @@ ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: ## kill: %AX %AX %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -77,32 +33,12 @@ ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; 
AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -123,32 +59,12 @@ ; AVX2: ## BB#0: ; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -167,117 +83,8 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX2-LABEL: v32i8: ; AVX2: ## BB#0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: Lcfi0: -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: Lcfi1: -; AVX2-NEXT: .cfi_offset %rbp, -16 -; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: Lcfi2: -; AVX2-NEXT: .cfi_def_cfa_register %rbp -; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $32, %rsp ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; 
AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: movl (%rsp), %eax -; AVX2-NEXT: movq %rbp, %rsp -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -297,20 +104,12 @@ ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -332,20 +131,12 @@ ; AVX2: ## BB#0: ; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd 
%xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ;
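
Note on the test inputs: this diff only changes the CHECK lines; the IR bodies of the @v* test functions are unchanged and not shown above. As a rough sketch of the kind of IR the new combine targets, a test such as @v16i8 presumably looks like the following (the value names and the use of a signed greater-than compare are assumptions, inferred from the function labels, the i16/i32 return widths and the pcmpgt/cmplt instructions in the CHECK lines):

define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) {
  ; (v16i1 setcc v16i8 %a, %b, gt)
  %x = icmp sgt <16 x i8> %a, %b
  ; (i16 bitcast v16i1 %x): with this patch the mask is produced directly by
  ; pcmpgtb + pmovmskb instead of extracting and spilling each i1 lane.
  %res = bitcast <16 x i1> %x to i16
  ret i16 %res
}

For wider elements (for example @v8i32), the combine additionally emits the byte shuffle described in combineBitcastOfSetCC before the mask extraction, which is why those CHECK lines show vpshufb/vpunpck sequences ahead of vpmovmskb rather than a bare vpmovmskb.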