Index: llvm/trunk/include/llvm/Target/TargetLowering.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h
+++ llvm/trunk/include/llvm/Target/TargetLowering.h
@@ -637,6 +637,14 @@
            getTruncStoreAction(ValVT.getSimpleVT(), MemVT.getSimpleVT()) == Legal;
   }
 
+  /// Return true if the specified store with truncation has a legal or custom
+  /// lowering on this target.
+  bool isTruncStoreLegalOrCustom(EVT ValVT, EVT MemVT) const {
+    return isTypeLegal(ValVT) && MemVT.isSimple() &&
+           (getTruncStoreAction(ValVT.getSimpleVT(), MemVT.getSimpleVT()) == Legal ||
+            getTruncStoreAction(ValVT.getSimpleVT(), MemVT.getSimpleVT()) == Custom);
+  }
+
   /// Return how the indexed load should be treated: either it is legal, needs
   /// to be promoted to a larger size, needs to be expanded to some other code
   /// sequence, or the target has a custom expander for it.
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -1394,6 +1394,7 @@
       setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
       setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
       setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
+      setTruncStoreAction(VT, MaskVT, Custom);
     }
     setOperationAction(ISD::FADD, MVT::v16f32, Legal);
     setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
@@ -16106,6 +16107,65 @@
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
 
+// Lower truncating store. We need a special lowering for vXi1 vectors.
+static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG) {
+  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
+  SDLoc dl(St);
+  EVT MemVT = St->getMemoryVT();
+  assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
+  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
+         "Expected truncstore of i1 vector");
+
+  SDValue Op = St->getValue();
+  MVT OpVT = Op.getValueType().getSimpleVT();
+  unsigned NumElts = OpVT.getVectorNumElements();
+  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+      NumElts == 16) {
+    // Truncate and store - everything is legal.
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
+    if (MemVT.getSizeInBits() < 8)
+      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                       DAG.getUNDEF(MVT::v8i1), Op,
+                       DAG.getIntPtrConstant(0, dl));
+    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
+                        St->getMemOperand());
+  }
+
+  // Otherwise, assume that only AVX-512F is available.
+  if (NumElts <= 8) {
+    if (NumElts < 8) {
+      // Extend to an 8-element vector.
+      MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
+      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
+                       DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
+    }
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
+    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
+                        St->getMemOperand());
+  }
+  // v32i8
+  assert(OpVT == MVT::v32i8 && "Unexpected operand type");
+  // Divide the vector into 2 parts and store each part separately.
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
+                           DAG.getIntPtrConstant(0, dl));
+  Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
+  SDValue BasePtr = St->getBasePtr();
+  SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
+                              St->getMemOperand());
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
+                           DAG.getIntPtrConstant(16, dl));
+  Hi = DAG.getNode(ISD::TRUNCATE, dl,
MVT::v16i1, Hi); + + SDValue BasePtrHi = + DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + DAG.getConstant(2, dl, BasePtr.getValueType())); + + SDValue StHi = DAG.getStore(St->getChain(), dl, Hi, + BasePtrHi, St->getMemOperand()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi); +} + static SDValue LowerExtended1BitVectorLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -21444,6 +21504,7 @@ case ISD::GC_TRANSITION_START: return LowerGC_TRANSITION_START(Op, DAG); case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); + case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG); } } @@ -28021,7 +28082,7 @@ // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. // In this case we don't need any further transformations. - if (TLI.isTruncStoreLegal(VT, StVT)) + if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) return SDValue(); // From, To sizes and ElemCount must be pow of two Index: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll +++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll @@ -521,17 +521,9 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) { ; KNL-LABEL: test22: ; KNL: ## BB#0: -; KNL-NEXT: vpextrd $3, %xmm0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: vpextrd $2, %xmm0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: vpextrd $1, %xmm0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: vmovd %xmm0, %eax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vpslld $31, %ymm0, %ymm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) ; KNL-NEXT: retq ; @@ -548,11 +540,9 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) { ; KNL-LABEL: test23: ; KNL: ## BB#0: -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) ; KNL-NEXT: retq ; @@ -596,11 +586,9 @@ ; KNL-LABEL: store_v2i1: ; KNL: ## BB#0: ; KNL-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) ; KNL-NEXT: retq ; @@ -621,17 +609,9 @@ ; KNL: ## BB#0: ; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 ; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpextrd $3, %xmm0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: vpextrd $2, %xmm0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: vpextrd $1, %xmm0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: movb %al, (%rdi) -; KNL-NEXT: vmovd %xmm0, %eax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vpslld $31, %ymm0, %ymm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) ; KNL-NEXT: retq ; @@ -1038,3 +1018,128 @@ %c = sext <64 x i1> %b to <64 x i8> ret <64 x i8> %c } + +define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) { +; KNL-LABEL: store_8i1: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; 
KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: store_8i1: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k0 +; SKX-NEXT: kmovb %k0, (%rdi) +; SKX-NEXT: retq + store <8 x i1> %v, <8 x i1>* %a + ret void +} + +define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) { +; KNL-LABEL: store_8i1_1: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: movb %al, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: store_8i1_1: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 +; SKX-NEXT: vpmovw2m %xmm0, %k0 +; SKX-NEXT: kmovb %k0, (%rdi) +; SKX-NEXT: retq + %v1 = trunc <8 x i16> %v to <8 x i1> + store <8 x i1> %v1, <8 x i1>* %a + ret void +} + +define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) { +; KNL-LABEL: store_16i1: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: store_16i1: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 +; SKX-NEXT: vpmovb2m %xmm0, %k0 +; SKX-NEXT: kmovw %k0, (%rdi) +; SKX-NEXT: retq + store <16 x i1> %v, <16 x i1>* %a + ret void +} + +define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) { +; KNL-LABEL: store_32i1: +; KNL: ## BB#0: +; KNL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, 2(%rdi) +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: store_32i1: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 +; SKX-NEXT: vpmovb2m %ymm0, %k0 +; SKX-NEXT: kmovd %k0, (%rdi) +; SKX-NEXT: retq + store <32 x i1> %v, <32 x i1>* %a + ret void +} + +define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) { +; KNL-LABEL: store_32i1_1: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, 2(%rdi) +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: store_32i1_1: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $15, %zmm0, %zmm0 +; SKX-NEXT: vpmovw2m %zmm0, %k0 +; SKX-NEXT: kmovd %k0, (%rdi) +; SKX-NEXT: retq + %v1 = trunc <32 x i16> %v to <32 x i1> + store <32 x i1> %v1, <32 x i1>* %a + ret void +} + + +define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { +; +; SKX-LABEL: store_64i1: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %zmm0, %zmm0 +; SKX-NEXT: vpmovb2m %zmm0, %k0 +; SKX-NEXT: kmovq %k0, (%rdi) +; SKX-NEXT: retq + store <64 x i1> %v, <64 x i1>* %a + ret void +} Index: llvm/trunk/test/CodeGen/X86/vector-compare-results.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-compare-results.ll +++ llvm/trunk/test/CodeGen/X86/vector-compare-results.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | 
FileCheck %s --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42 @@ -6569,393 +6570,41 @@ ; AVX512-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2 ; AVX512-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpextrb $15, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $14, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $13, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $12, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $11, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $10, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $9, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $8, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $7, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $6, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $5, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $4, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $3, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $2, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $1, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $0, %xmm4, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $15, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $14, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $13, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $12, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $11, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $10, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $9, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $8, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $7, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $6, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $5, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $4, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $3, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $2, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) -; AVX512-NEXT: vpextrb $1, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 
12(%rdi) -; AVX512-NEXT: vpextrb $0, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 12(%rdi) +; AVX512-NEXT: vpmovsxbd %xmm4, %zmm4 +; AVX512-NEXT: vpslld $31, %zmm4, %zmm4 +; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k0 +; AVX512-NEXT: kmovw %k0, 14(%rdi) +; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512-NEXT: kmovw %k0, 12(%rdi) ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpextrb $15, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $14, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $13, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $12, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $11, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $10, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $9, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $8, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $7, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $6, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $5, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $4, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $3, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $2, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $1, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $0, %xmm3, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $15, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $14, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $13, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $12, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $11, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $10, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $9, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $8, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $7, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $6, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $5, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $4, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $3, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $2, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; 
AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $1, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) -; AVX512-NEXT: vpextrb $0, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 8(%rdi) +; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512-NEXT: kmovw %k0, 10(%rdi) +; AVX512-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512-NEXT: kmovw %k0, 8(%rdi) ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpextrb $15, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $14, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $13, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $12, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $11, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $10, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $9, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $8, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $7, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $6, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $5, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $4, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $3, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $2, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $1, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $0, %xmm2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $15, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $14, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $13, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $12, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $11, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $10, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $9, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $8, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $7, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $6, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $5, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $4, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $3, %xmm1, %eax -; AVX512-NEXT: 
andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $2, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $1, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) -; AVX512-NEXT: vpextrb $0, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, 4(%rdi) +; AVX512-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512-NEXT: kmovw %k0, 6(%rdi) +; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512-NEXT: kmovw %k0, 4(%rdi) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpextrb $15, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $14, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $13, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $12, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $11, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $10, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $9, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $8, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $7, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $6, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $5, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $4, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $3, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $2, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $1, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $0, %xmm1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $15, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $14, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $13, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $12, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $11, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $10, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $9, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $8, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $7, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $6, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $5, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $4, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; 
AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $3, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $2, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $1, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512-NEXT: kmovw %k0, 2(%rdi) +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kmovw %k0, (%rdi) ; AVX512-NEXT: movq %rdi, %rax ; AVX512-NEXT: retq %1 = icmp sgt <128 x i8> %a0, %a1
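
The KNL lowering above ends up storing wide i1 vectors as packed mask words: a <32 x i1> value becomes two 16-bit masks written at byte offsets 0 and 2, which is exactly what the paired kmovw %k0, (%rdi) and kmovw %k0, 2(%rdi) instructions in the updated checks do. The following standalone C++ sketch (not part of the patch; packMask16 and store32i1 are invented names for illustration) shows the memory layout this produces, assuming the usual little-endian, lane-i-to-bit-i mask convention:

#include <array>
#include <cstdint>
#include <cstdio>

// Pack 16 boolean lanes into one mask word, lane i -> bit i (kmovw layout).
static uint16_t packMask16(const bool *Lanes) {
  uint16_t Mask = 0;
  for (unsigned I = 0; I != 16; ++I)
    Mask |= static_cast<uint16_t>(Lanes[I]) << I;
  return Mask;
}

// Store a 32-lane mask as two 16-bit halves, mirroring the split in
// LowerTruncatingStore: the low half at byte offset 0, the high half at 2.
static void store32i1(const std::array<bool, 32> &Lanes, uint8_t *Mem) {
  uint16_t Lo = packMask16(Lanes.data());
  uint16_t Hi = packMask16(Lanes.data() + 16);
  Mem[0] = static_cast<uint8_t>(Lo & 0xff);
  Mem[1] = static_cast<uint8_t>(Lo >> 8);
  Mem[2] = static_cast<uint8_t>(Hi & 0xff);
  Mem[3] = static_cast<uint8_t>(Hi >> 8);
}

int main() {
  std::array<bool, 32> Lanes{};
  Lanes[0] = Lanes[17] = true; // lane 0 -> bit 0 of byte 0, lane 17 -> bit 1 of byte 2
  uint8_t Mem[4] = {};
  store32i1(Lanes, Mem);
  std::printf("%02x %02x %02x %02x\n", Mem[0], Mem[1], Mem[2], Mem[3]); // prints: 01 00 02 00
  return 0;
}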