diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23870,17 +23870,22 @@
   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
   if (StoredVal.getValueType().isVector() &&
       StoredVal.getValueType().getVectorElementType() == MVT::i1) {
-    assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
-           "Unexpected VT");
+    unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
+    assert(NumElts <= 8 && "Unexpected VT");
     assert(!St->isTruncatingStore() && "Expected non-truncating store");
     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
            "Expected AVX512F without AVX512DQI");
 
+    // We must pad with zeros to ensure we store zeros to any unused bits.
     StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
                             DAG.getUNDEF(MVT::v16i1), StoredVal,
                             DAG.getIntPtrConstant(0, dl));
     StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
     StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
+    // Make sure we store zeros in the extra bits.
+    if (NumElts < 8)
+      StoredVal = DAG.getZeroExtendInReg(StoredVal, dl,
+                                         MVT::getIntegerVT(NumElts));
 
     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                         St->getPointerInfo(), St->getOriginalAlign(),
@@ -44971,17 +44976,21 @@
   if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
       StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
       StoredVal.getOperand(0).getValueType() == MVT::i8) {
-    return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
+    SDValue Val = StoredVal.getOperand(0);
+    // We must store zeros to the unused bits.
+    Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
+    return DAG.getStore(St->getChain(), dl, Val,
                         St->getBasePtr(), St->getPointerInfo(),
                         St->getOriginalAlign(),
                         St->getMemOperand()->getFlags());
   }
 
   // Widen v2i1/v4i1 stores to v8i1.
-  if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
+  if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
       Subtarget.hasAVX512()) {
     unsigned NumConcats = 8 / VT.getVectorNumElements();
-    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
+    // We must store zeros to the unused bits.
+    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
     Ops[0] = StoredVal;
     StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2921,9 +2921,6 @@
 // Load/store kreg
 let Predicates = [HasDQI] in {
-  def : Pat<(store VK1:$src, addr:$dst),
-            (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
-
   def : Pat<(v1i1 (load addr:$src)),
             (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>;
   def : Pat<(v2i1 (load addr:$src)),
diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
--- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -593,6 +593,8 @@
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
 ; AVX512-NEXT: kshiftrb $1, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -600,6 +602,8 @@
 ; AVX512NOTDQ: # %bb.0:
 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
 ; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
@@ -619,6 +623,8 @@
 ; AVX512-NEXT: cmovel %ecx, %eax
 ; AVX512-NEXT: kmovd %eax, %k0
 ; AVX512-NEXT: kshiftrb $1, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -632,6 +638,8 @@
 ; AVX512NOTDQ-NEXT: cmovel %ecx, %eax
 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
 ; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
@@ -649,6 +657,8 @@
 ; AVX512-NEXT: cmovel %eax, %ecx
 ; AVX512-NEXT: kmovd %ecx, %k0
 ; AVX512-NEXT: kshiftrb $2, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -660,6 +670,8 @@
 ; AVX512NOTDQ-NEXT: cmovel %eax, %ecx
 ; AVX512NOTDQ-NEXT: kmovd %ecx, %k0
 ; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
@@ -673,6 +685,8 @@
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
 ; AVX512-NEXT: kshiftrb $2, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -680,6 +694,8 @@
 ; AVX512NOTDQ: # %bb.0:
 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
 ; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
@@ -693,6 +709,8 @@
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
 ; AVX512-NEXT: kshiftrb $3, %k0, %k0
+; AVX512-NEXT: kshiftlb $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -700,6 +718,8 @@
 ; AVX512NOTDQ: # %bb.0:
 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
 ; AVX512NOTDQ-NEXT: kshiftrw
$3, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -713,6 +733,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 ; AVX512-NEXT: kshiftrb $4, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -720,6 +742,8 @@ ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -760,6 +784,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: kmovb (%rdi), %k0 ; AVX512-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -767,6 +793,8 @@ ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -807,6 +835,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: kmovw (%rdi), %k0 ; AVX512-NEXT: kshiftrw $8, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -814,6 +844,8 @@ ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -881,6 +913,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: kmovw (%rdi), %k0 ; AVX512-NEXT: kshiftrw $15, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -888,6 +922,8 @@ ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -955,6 +991,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: kmovd (%rdi), %k0 ; AVX512-NEXT: kshiftrd $16, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -962,6 +1000,8 @@ ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -1056,6 +1096,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: kmovd (%rdi), %k0 ; AVX512-NEXT: kshiftrd $31, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -1063,6 +1105,8 @@ ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrd $31, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -1160,6 +1204,8 @@ ; AVX512: # %bb.0: ; 
AVX512-NEXT: kmovq (%rdi), %k0 ; AVX512-NEXT: kshiftrq $32, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -1167,6 +1213,8 @@ ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq @@ -1286,6 +1334,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: kmovq (%rdi), %k0 ; AVX512-NEXT: kshiftrq $63, %k0, %k0 +; AVX512-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-NEXT: kshiftrb $7, %k0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: retq ; @@ -1293,6 +1343,8 @@ ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrq $63, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll b/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll --- a/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll +++ b/llvm/test/CodeGen/X86/avx512-load-trunc-store-i1.ll @@ -5,13 +5,18 @@ define void @load_v1i2_trunc_v1i1_store(<1 x i2>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i2_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i2_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i2>, <1 x i2>* %a0 @@ -22,13 +27,18 @@ define void @load_v1i3_trunc_v1i1_store(<1 x i3>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i3_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i3_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i3>, <1 x i3>* %a0 @@ -39,13 +49,18 @@ define void @load_v1i4_trunc_v1i1_store(<1 x i4>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i4_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i4_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i4>, <1 x i4>* %a0 @@ -56,13 +71,18 @@ define void 
@load_v1i8_trunc_v1i1_store(<1 x i8>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i8_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i8_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i8>, <1 x i8>* %a0 @@ -73,13 +93,18 @@ define void @load_v1i16_trunc_v1i1_store(<1 x i16>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i16_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i16_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i16>, <1 x i16>* %a0 @@ -90,13 +115,18 @@ define void @load_v1i32_trunc_v1i1_store(<1 x i32>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i32_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i32_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i32>, <1 x i32>* %a0 @@ -107,13 +137,18 @@ define void @load_v1i64_trunc_v1i1_store(<1 x i64>* %a0,<1 x i1>* %a1) { ; AVX512-ALL-LABEL: load_v1i64_trunc_v1i1_store: ; AVX512-ALL: # %bb.0: -; AVX512-ALL-NEXT: movb (%rdi), %al -; AVX512-ALL-NEXT: movb %al, (%rsi) +; AVX512-ALL-NEXT: kmovb (%rdi), %k0 +; AVX512-ALL-NEXT: kshiftlb $7, %k0, %k0 +; AVX512-ALL-NEXT: kshiftrb $7, %k0, %k0 +; AVX512-ALL-NEXT: kmovb %k0, (%rsi) ; AVX512-ALL-NEXT: retq ; ; AVX512-ONLY-LABEL: load_v1i64_trunc_v1i1_store: ; AVX512-ONLY: # %bb.0: ; AVX512-ONLY-NEXT: movb (%rdi), %al +; AVX512-ONLY-NEXT: andl $1, %eax +; AVX512-ONLY-NEXT: kmovw %eax, %k0 +; AVX512-ONLY-NEXT: kmovw %k0, %eax ; AVX512-ONLY-NEXT: movb %al, (%rsi) ; AVX512-ONLY-NEXT: retq %d0 = load <1 x i64>, <1 x i64>* %a0 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1455,6 +1455,8 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rdi) ; KNL-NEXT: vzeroupper @@ -1471,6 +1473,8 @@ ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kshiftlw 
$12, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movb %al, (%rdi)
 ; AVX512BW-NEXT: vzeroupper
@@ -1480,6 +1484,8 @@
 ; AVX512DQ: ## %bb.0:
 ; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT: kshiftlb $4, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $4, %k0, %k0
 ; AVX512DQ-NEXT: kmovb %k0, (%rdi)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
@@ -1500,6 +1506,8 @@
 ; KNL: ## %bb.0:
 ; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k0
+; KNL-NEXT: kshiftrw $14, %k0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: movb %al, (%rdi)
 ; KNL-NEXT: vzeroupper
@@ -1516,6 +1524,8 @@
 ; AVX512BW: ## %bb.0:
 ; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $14, %k0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movb %al, (%rdi)
 ; AVX512BW-NEXT: vzeroupper
@@ -1525,6 +1535,8 @@
 ; AVX512DQ: ## %bb.0:
 ; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
+; AVX512DQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $6, %k0, %k0
 ; AVX512DQ-NEXT: kmovb %k0, (%rdi)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
@@ -1545,6 +1557,8 @@
 ; KNL: ## %bb.0:
 ; KNL-NEXT: kmovw %edi, %k0
 ; KNL-NEXT: knotw %k0, %k0
+; KNL-NEXT: kshiftlw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: movb %al, (%rsi)
 ; KNL-NEXT: retq
 ;
@@ -1553,6 +1567,8 @@
 ; SKX: ## %bb.0:
 ; SKX-NEXT: kmovd %edi, %k0
 ; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kshiftlb $7, %k0, %k0
+; SKX-NEXT: kshiftrb $7, %k0, %k0
 ; SKX-NEXT: kmovb %k0, (%rsi)
 ; SKX-NEXT: retq
 ;
@@ -1560,6 +1576,8 @@
 ; AVX512BW: ## %bb.0:
 ; AVX512BW-NEXT: kmovd %edi, %k0
 ; AVX512BW-NEXT: knotw %k0, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movb %al, (%rsi)
 ; AVX512BW-NEXT: retq
@@ -1568,6 +1586,8 @@
 ; AVX512DQ: ## %bb.0:
 ; AVX512DQ-NEXT: kmovw %edi, %k0
 ; AVX512DQ-NEXT: knotw %k0, %k0
+; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0
 ; AVX512DQ-NEXT: kmovb %k0, (%rsi)
 ; AVX512DQ-NEXT: retq
 ;
@@ -1576,6 +1596,8 @@
 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: knotw %k0, %k0
+; X86-NEXT: kshiftlb $7, %k0, %k0
+; X86-NEXT: kshiftrb $7, %k0, %k0
 ; X86-NEXT: kmovb %k0, (%eax)
 ; X86-NEXT: retl
 %x = xor <1 x i1> %c, <i1 1>
@@ -1588,6 +1610,8 @@
 ; KNL: ## %bb.0:
 ; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
 ; KNL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k0
+; KNL-NEXT: kshiftrw $14, %k0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: movb %al, (%rdi)
 ; KNL-NEXT: vzeroupper
@@ -1598,6 +1622,8 @@
 ; SKX: ## %bb.0:
 ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
 ; SKX-NEXT: vpmovq2m %xmm0, %k0
 ; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kshiftlb $6, %k0, %k0
+; SKX-NEXT: kshiftrb $6, %k0, %k0
 ; SKX-NEXT: kmovb %k0, (%rdi)
 ; SKX-NEXT: retq
 ;
@@ -1605,6 +1631,8 @@
 ; AVX512BW: ## %bb.0:
 ; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $14, %k0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movb %al, (%rdi)
 ; AVX512BW-NEXT: vzeroupper
@@ -1615,6 +1643,8 @@
 ; AVX512DQ: ## %bb.0:
 ; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpmovq2m %zmm0, %k0
 ; AVX512DQ-NEXT: knotw %k0, %k0
+; AVX512DQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $6, %k0, %k0
 ; AVX512DQ-NEXT: kmovb %k0, (%rdi)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
@@ -1625,6 +1655,8 @@
 ; X86-NEXT: vpmovq2m %xmm0, %k0
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: knotw %k0, %k0
+; X86-NEXT: kshiftlb $6, %k0, %k0
+; X86-NEXT: kshiftrb $6, %k0, %k0
 ; X86-NEXT: kmovb %k0, (%eax)
 ; X86-NEXT: retl
 %x = xor <2 x i1> %c, <i1 1, i1 1>
@@ -1637,6 +1669,8 @@
 ; KNL: ## %bb.0:
 ; KNL-NEXT: vpslld $31, %xmm0, %xmm0
 ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $12, %k0, %k0
+; KNL-NEXT: kshiftrw $12, %k0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: movb %al, (%rdi)
 ; KNL-NEXT: vzeroupper
@@ -1647,6 +1681,8 @@
 ; SKX: ## %bb.0:
 ; SKX-NEXT: vpslld $31, %xmm0, %xmm0
 ; SKX-NEXT: vpmovd2m %xmm0, %k0
 ; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kshiftlb $4, %k0, %k0
+; SKX-NEXT: kshiftrb $4, %k0, %k0
 ; SKX-NEXT: kmovb %k0, (%rdi)
 ; SKX-NEXT: retq
 ;
@@ -1654,6 +1690,8 @@
 ; AVX512BW: ## %bb.0:
 ; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kshiftlw $12, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movb %al, (%rdi)
 ; AVX512BW-NEXT: vzeroupper
@@ -1664,6 +1702,8 @@
 ; AVX512DQ: ## %bb.0:
 ; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
 ; AVX512DQ-NEXT: knotw %k0, %k0
+; AVX512DQ-NEXT: kshiftlb $4, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $4, %k0, %k0
 ; AVX512DQ-NEXT: kmovb %k0, (%rdi)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
@@ -1674,6 +1714,8 @@
 ; X86-NEXT: vpslld $31, %xmm0, %xmm0
 ; X86-NEXT: vpmovd2m %xmm0, %k0
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: knotw %k0, %k0
+; X86-NEXT: kshiftlb $4, %k0, %k0
+; X86-NEXT: kshiftrb $4, %k0, %k0
 ; X86-NEXT: kmovb %k0, (%eax)
 ; X86-NEXT: retl
 %x = xor <4 x i1> %c, <i1 1, i1 1, i1 1, i1 1>
@@ -5206,6 +5248,8 @@
 ; KNL-NEXT: kmovw %edi, %k0
 ; KNL-NEXT: kmovw %esi, %k1
 ; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: movb %al, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT: movb -{{[0-9]+}}(%rsp), %al
@@ -5220,6 +5264,8 @@
 ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0
 ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
 ; SKX-NEXT: kxorw %k1, %k0, %k0
+; SKX-NEXT: kshiftlb $7, %k0, %k0
+; SKX-NEXT: kshiftrb $7, %k0, %k0
 ; SKX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; SKX-NEXT: retq
@@ -5229,6 +5275,8 @@
 ; AVX512BW-NEXT: kmovd %edi, %k0
 ; AVX512BW-NEXT: kmovd %esi, %k1
 ; AVX512BW-NEXT: kxorw %k1, %k0, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
 ; AVX512BW-NEXT: kmovd %k0, %eax
 ; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
 ; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
@@ -5243,6 +5291,8 @@
 ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0
 ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
 ; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0
+; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0
 ; AVX512DQ-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
 ; AVX512DQ-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; AVX512DQ-NEXT: retq
@@ -5260,6 +5310,8 @@
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
 ; X86-NEXT: kxorw %k1, %k0, %k0
+; X86-NEXT: kshiftlb $7, %k0, %k0
+; X86-NEXT: kshiftrb $7, %k0, %k0
 ; X86-NEXT: kmovb %k0, {{[0-9]+}}(%esp)
 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT: popl %ecx
@@ -5277,6 +5329,8 @@
 ; KNL-NEXT: kmovw %edi, %k0
 ; KNL-NEXT: kmovw %esi, %k1
 ; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: movb %al,
-{{[0-9]+}}(%rsp) ; KNL-NEXT: movb -{{[0-9]+}}(%rsp), %al @@ -5291,6 +5345,8 @@ ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kxorw %k1, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k0 ; SKX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SKX-NEXT: retq @@ -5300,6 +5356,8 @@ ; AVX512BW-NEXT: kmovd %edi, %k0 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al @@ -5314,6 +5372,8 @@ ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0 ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 ; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; AVX512DQ-NEXT: movb -{{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: retq @@ -5331,6 +5391,8 @@ ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-NEXT: kxorw %k1, %k0, %k0 +; X86-NEXT: kshiftlb $7, %k0, %k0 +; X86-NEXT: kshiftrb $7, %k0, %k0 ; X86-NEXT: kmovb %k0, {{[0-9]+}}(%esp) ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: popl %ecx @@ -5348,6 +5410,8 @@ ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kmovw %esi, %k1 ; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; KNL-NEXT: movb -{{[0-9]+}}(%rsp), %al @@ -5362,6 +5426,8 @@ ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kandw %k1, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k0 ; SKX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SKX-NEXT: retq @@ -5371,6 +5437,8 @@ ; AVX512BW-NEXT: kmovd %edi, %k0 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al @@ -5385,6 +5453,8 @@ ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k0 ; AVX512DQ-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 ; AVX512DQ-NEXT: kandw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $7, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) ; AVX512DQ-NEXT: movb -{{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: retq @@ -5402,6 +5472,8 @@ ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-NEXT: kandw %k1, %k0, %k0 +; X86-NEXT: kshiftlb $7, %k0, %k0 +; X86-NEXT: kshiftrb $7, %k0, %k0 ; X86-NEXT: kmovb %k0, {{[0-9]+}}(%esp) ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: popl %ecx diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -552,6 +552,8 @@ ; X86-AVX512F-NEXT: kandnw %k1, %k2, %k1 ; X86-AVX512F-NEXT: kandw %k2, %k0, %k0 ; X86-AVX512F-NEXT: korw %k1, %k0, %k0 +; X86-AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; X86-AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; X86-AVX512F-NEXT: kmovw %k0, %eax ; X86-AVX512F-NEXT: movb %al, (%edx) ; X86-AVX512F-NEXT: popl %esi @@ -568,6 +570,8 @@ ; X64-AVX512F-NEXT: kandnw %k1, %k2, %k1 ; X64-AVX512F-NEXT: kandw %k2, %k0, %k0 ; X64-AVX512F-NEXT: korw 
%k1, %k0, %k0 +; X64-AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; X64-AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; X64-AVX512F-NEXT: kmovw %k0, %eax ; X64-AVX512F-NEXT: movb %al, (%rsi) ; X64-AVX512F-NEXT: retq @@ -587,6 +591,8 @@ ; X86-AVX512BW-NEXT: kandnw %k1, %k2, %k1 ; X86-AVX512BW-NEXT: kandw %k2, %k0, %k0 ; X86-AVX512BW-NEXT: korw %k1, %k0, %k0 +; X86-AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; X86-AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; X86-AVX512BW-NEXT: kmovd %k0, %eax ; X86-AVX512BW-NEXT: movb %al, (%edx) ; X86-AVX512BW-NEXT: popl %esi @@ -603,6 +609,8 @@ ; X64-AVX512BW-NEXT: kandnw %k1, %k2, %k1 ; X64-AVX512BW-NEXT: kandw %k2, %k0, %k0 ; X64-AVX512BW-NEXT: korw %k1, %k0, %k0 +; X64-AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; X64-AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; X64-AVX512BW-NEXT: kmovd %k0, %eax ; X64-AVX512BW-NEXT: movb %al, (%rsi) ; X64-AVX512BW-NEXT: retq @@ -634,6 +642,8 @@ ; X86-AVX512F-NEXT: movzbl (%eax), %ecx ; X86-AVX512F-NEXT: kmovw %ecx, %k0 ; X86-AVX512F-NEXT: .LBB18_3: +; X86-AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; X86-AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; X86-AVX512F-NEXT: kmovw %k0, %ecx ; X86-AVX512F-NEXT: movb %cl, (%eax) ; X86-AVX512F-NEXT: retl @@ -653,6 +663,8 @@ ; X64-AVX512F-NEXT: movzbl (%rsi), %eax ; X64-AVX512F-NEXT: kmovw %eax, %k0 ; X64-AVX512F-NEXT: .LBB18_3: +; X64-AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; X64-AVX512F-NEXT: kshiftrw $15, %k0, %k0 ; X64-AVX512F-NEXT: kmovw %k0, %eax ; X64-AVX512F-NEXT: movb %al, (%rsi) ; X64-AVX512F-NEXT: retq @@ -675,6 +687,8 @@ ; X86-AVX512BW-NEXT: movzbl (%eax), %ecx ; X86-AVX512BW-NEXT: kmovd %ecx, %k0 ; X86-AVX512BW-NEXT: .LBB18_3: +; X86-AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; X86-AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; X86-AVX512BW-NEXT: kmovd %k0, %ecx ; X86-AVX512BW-NEXT: movb %cl, (%eax) ; X86-AVX512BW-NEXT: retl @@ -694,6 +708,8 @@ ; X64-AVX512BW-NEXT: movzbl (%rsi), %eax ; X64-AVX512BW-NEXT: kmovd %eax, %k0 ; X64-AVX512BW-NEXT: .LBB18_3: +; X64-AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; X64-AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; X64-AVX512BW-NEXT: kmovd %k0, %eax ; X64-AVX512BW-NEXT: movb %al, (%rsi) ; X64-AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll --- a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll +++ b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll @@ -89,9 +89,13 @@ ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vmovaps (%rdi), %ymm0 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload -; CHECK-NEXT: kmovw %k1, %eax -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: movb %cl, (%rsi) +; CHECK-NEXT: kshiftlw $12, %k0, %k2 +; CHECK-NEXT: kshiftrw $12, %k2, %k2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: movb %al, (%rsi) +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $12, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: movb %al, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -115,9 +119,13 @@ ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload -; CHECK-NEXT: kmovw %k1, %eax -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: movb %cl, (%rsi) +; CHECK-NEXT: kshiftlw $12, %k0, %k2 +; CHECK-NEXT: kshiftrw $12, %k2, %k2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: movb %al, (%rsi) +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $12, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: movb %al, (%rdx) ; CHECK-NEXT: retq %1 = tail call <2 x 
i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -140,9 +148,13 @@ ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload -; CHECK-NEXT: kmovw %k1, %eax -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: movb %cl, (%rsi) +; CHECK-NEXT: kshiftlw $14, %k0, %k2 +; CHECK-NEXT: kshiftrw $14, %k2, %k2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: movb %al, (%rsi) +; CHECK-NEXT: kshiftlw $14, %k1, %k0 +; CHECK-NEXT: kshiftrw $14, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: movb %al, (%rdx) ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -1131,6 +1131,8 @@ ; AVX512-NEXT: kxorw %k2, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: kshiftlw $12, %k0, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -3559,64 +3559,65 @@ ; ; AVX512-LABEL: smulo_v4i1: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 ; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %k1, %r9d -; AVX512-NEXT: andb $1, %r9b -; AVX512-NEXT: negb %r9b +; AVX512-NEXT: kmovd %k1, %r10d +; AVX512-NEXT: andb $1, %r10b +; AVX512-NEXT: negb %r10b ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: kshiftrw $3, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %r10d -; AVX512-NEXT: andb $1, %r10b -; AVX512-NEXT: negb %r10b +; AVX512-NEXT: kmovd %k2, %r9d +; AVX512-NEXT: andb $1, %r9b +; AVX512-NEXT: negb %r9b ; AVX512-NEXT: kshiftrw $2, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %r11d -; AVX512-NEXT: andb $1, %r11b -; AVX512-NEXT: negb %r11b +; AVX512-NEXT: kmovd %k2, %ebp +; AVX512-NEXT: andb $1, %bpl +; AVX512-NEXT: negb %bpl ; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %ebx -; AVX512-NEXT: andb $1, %bl -; AVX512-NEXT: negb %bl +; AVX512-NEXT: kmovd %k2, %edx +; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: negb %dl ; AVX512-NEXT: kshiftrw $1, %k0, %k2 ; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil ; AVX512-NEXT: negb %sil ; AVX512-NEXT: kshiftrw $1, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %edx -; AVX512-NEXT: andb $1, %dl -; AVX512-NEXT: negb %dl +; AVX512-NEXT: kmovd %k2, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl ; AVX512-NEXT: kmovd %k1, %eax ; AVX512-NEXT: andb $1, %al ; AVX512-NEXT: negb %al -; AVX512-NEXT: kmovd %k0, %ecx -; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: negb %cl +; AVX512-NEXT: kmovd %k0, %ebx +; AVX512-NEXT: andb $1, 
%bl +; AVX512-NEXT: negb %bl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: imulb %cl -; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: imulb %bl +; AVX512-NEXT: movl %eax, %r11d ; AVX512-NEXT: seto %al -; AVX512-NEXT: movl %r8d, %ecx -; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: negb %cl -; AVX512-NEXT: cmpb %r8b, %cl -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: movl %r11d, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: negb %bl +; AVX512-NEXT: cmpb %r11b, %bl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %al, %bl ; AVX512-NEXT: setne %al ; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: movw $-3, %ax ; AVX512-NEXT: kmovd %eax, %k0 ; AVX512-NEXT: kandw %k0, %k1, %k1 -; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: movl %ecx, %eax ; AVX512-NEXT: imulb %sil -; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al -; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: movl %r8d, %ecx ; AVX512-NEXT: andb $1, %cl ; AVX512-NEXT: negb %cl -; AVX512-NEXT: cmpb %dl, %cl +; AVX512-NEXT: cmpb %r8b, %cl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al @@ -3627,8 +3628,8 @@ ; AVX512-NEXT: movw $-5, %ax ; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kandw %k1, %k2, %k2 -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: imulb %bl +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: imulb %dl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: movl %esi, %ecx @@ -3643,25 +3644,26 @@ ; AVX512-NEXT: korw %k3, %k2, %k2 ; AVX512-NEXT: kshiftlw $13, %k2, %k2 ; AVX512-NEXT: kshiftrw $13, %k2, %k2 -; AVX512-NEXT: movl %r10d, %eax -; AVX512-NEXT: imulb %r9b +; AVX512-NEXT: movl %r9d, %eax +; AVX512-NEXT: imulb %r10b ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: andb $1, %bl -; AVX512-NEXT: negb %bl -; AVX512-NEXT: cmpb %al, %bl -; AVX512-NEXT: setne %bl -; AVX512-NEXT: orb %cl, %bl +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: negb %dl +; AVX512-NEXT: cmpb %al, %dl +; AVX512-NEXT: setne %dl +; AVX512-NEXT: orb %cl, %dl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k3 ; AVX512-NEXT: kshiftlw $3, %k3, %k3 ; AVX512-NEXT: korw %k3, %k2, %k2 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z} -; AVX512-NEXT: kmovd %r8d, %k2 +; AVX512-NEXT: andl $1, %r11d +; AVX512-NEXT: kmovw %r11d, %k2 ; AVX512-NEXT: kandw %k0, %k2, %k0 -; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kmovd %r8d, %k2 ; AVX512-NEXT: kshiftlw $15, %k2, %k2 ; AVX512-NEXT: kshiftrw $14, %k2, %k2 ; AVX512-NEXT: korw %k2, %k0, %k0 @@ -3680,6 +3682,7 @@ ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -1140,6 +1140,8 @@ ; AVX512-NEXT: kxorw %k2, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: kshiftlw $12, %k0, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll --- 
a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -1180,7 +1180,9 @@ ; AVX512-NEXT: kandnw %k0, %k1, %k2 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z} -; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: kshiftlw $12, %k1, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -3188,52 +3188,53 @@ ; ; AVX512-LABEL: umulo_v4i1: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %k1, %r9d -; AVX512-NEXT: andb $1, %r9b +; AVX512-NEXT: kmovd %k1, %r8d +; AVX512-NEXT: andb $1, %r8b ; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: kshiftrw $3, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %r9d +; AVX512-NEXT: andb $1, %r9b +; AVX512-NEXT: kshiftrw $2, %k0, %k2 ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b -; AVX512-NEXT: kshiftrw $2, %k0, %k2 +; AVX512-NEXT: kshiftrw $2, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %r11d ; AVX512-NEXT: andb $1, %r11b -; AVX512-NEXT: kshiftrw $2, %k1, %k2 -; AVX512-NEXT: kmovd %k2, %ebx -; AVX512-NEXT: andb $1, %bl ; AVX512-NEXT: kshiftrw $1, %k0, %k2 -; AVX512-NEXT: kmovd %k2, %edx -; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: kmovd %k2, %ecx +; AVX512-NEXT: andb $1, %cl ; AVX512-NEXT: kshiftrw $1, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: kmovd %k1, %ecx -; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: kmovd %k1, %edx +; AVX512-NEXT: andb $1, %dl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: mulb %cl -; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: mulb %dl +; AVX512-NEXT: movl %eax, %edx ; AVX512-NEXT: seto %al -; AVX512-NEXT: testb $-2, %r8b -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: testb $-2, %dl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %al, %bl ; AVX512-NEXT: setne %al ; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: movw $-3, %ax ; AVX512-NEXT: kmovd %eax, %k0 ; AVX512-NEXT: kandw %k0, %k1, %k1 -; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: movl %ecx, %eax ; AVX512-NEXT: mulb %sil -; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: movl %eax, %ebp ; AVX512-NEXT: seto %al -; AVX512-NEXT: testb $-2, %dl -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: testb $-2, %bpl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %al, %bl ; AVX512-NEXT: setne %al ; AVX512-NEXT: kmovd %eax, %k2 ; AVX512-NEXT: kshiftlw $15, %k2, %k2 @@ -3242,35 +3243,36 @@ ; AVX512-NEXT: movw $-5, %ax ; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kandw %k1, %k2, %k2 -; AVX512-NEXT: movl %r11d, %eax -; AVX512-NEXT: mulb %bl +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: mulb %r11b ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %sil -; AVX512-NEXT: setne %cl -; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %al, %bl ; AVX512-NEXT: setne %al ; AVX512-NEXT: kmovd %eax, %k3 ; AVX512-NEXT: kshiftlw $2, %k3, %k3 ; AVX512-NEXT: korw %k3, %k2, %k2 ; AVX512-NEXT: kshiftlw $13, 
%k2, %k2 ; AVX512-NEXT: kshiftrw $13, %k2, %k2 -; AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: mulb %r10b +; AVX512-NEXT: movl %r8d, %eax +; AVX512-NEXT: mulb %r9b ; AVX512-NEXT: # kill: def $al killed $al def $eax -; AVX512-NEXT: seto %cl +; AVX512-NEXT: seto %bl ; AVX512-NEXT: testb $-2, %al -; AVX512-NEXT: setne %bl -; AVX512-NEXT: orb %cl, %bl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %bl, %cl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k3 ; AVX512-NEXT: kshiftlw $3, %k3, %k3 ; AVX512-NEXT: korw %k3, %k2, %k2 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z} -; AVX512-NEXT: kmovd %r8d, %k2 +; AVX512-NEXT: andl $1, %edx +; AVX512-NEXT: kmovw %edx, %k2 ; AVX512-NEXT: kandw %k0, %k2, %k0 -; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kmovd %ebp, %k2 ; AVX512-NEXT: kshiftlw $15, %k2, %k2 ; AVX512-NEXT: kshiftrw $14, %k2, %k2 ; AVX512-NEXT: korw %k2, %k0, %k0 @@ -3289,6 +3291,7 @@ ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -1227,7 +1227,9 @@ ; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1} ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z} -; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: kshiftlw $12, %k1, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
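
Reviewer note: the functional change is the two X86ISelLowering.cpp hunks; all of the test churn above is fallout from writing zeros, rather than undef, to the padding bits of the byte stored for a v1i1/v2i1/v4i1 mask (zero padding before the CONCAT_VECTORS on the widened path, DAG.getZeroExtendInReg on the v1i1 scalar path). The VK1 KMOVBmk pattern is dropped since a raw kmovb would store bits 1-7 of the mask register unmasked. A minimal standalone reproducer in the style of the updated tests (function names are illustrative, not taken from this patch):

; With this change, llc on a DQ-capable AVX512 target is expected to clear
; the six padding bits (e.g. kshiftlb $6 followed by kshiftrb $6, as in the
; avx512-mask-op.ll checks above) before the kmovb byte store, instead of
; storing whatever happens to sit in bits 2-7 of the k-register.
define void @store_v2i1(<2 x i1> %v, <2 x i1>* %p) {
  store <2 x i1> %v, <2 x i1>* %p
  ret void
}

; On AVX512F-only targets the v1i1 scalar path now masks the value first
; (the andl $1 before the byte store in the AVX512-ONLY checks above), so
; only bit 0 of the stored byte can be nonzero.
define void @store_v1i1(<1 x i1> %v, <1 x i1>* %p) {
  store <1 x i1> %v, <1 x i1>* %p
  ret void
}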