Index: lib/Target/X86/X86CallingConv.td =================================================================== --- lib/Target/X86/X86CallingConv.td +++ lib/Target/X86/X86CallingConv.td @@ -73,8 +73,8 @@ CCIfSubtarget<"is64Bit()", CCIfByVal>>, CCIfByVal>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType>, // Promote v8i1/v16i1/v32i1 arguments to i32. CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType>, @@ -146,8 +146,8 @@ ]>; def RetCC_#NAME : CallingConv<[ - // Promote i1, v8i1 arguments to i8. - CCIfType<[i1, v8i1], CCPromoteToType>, + // Promote i1, v1i1, v8i1 arguments to i8. + CCIfType<[i1, v1i1, v8i1], CCPromoteToType>, // Promote v16i1 arguments to i16. CCIfType<[v16i1], CCPromoteToType>, @@ -207,6 +207,7 @@ // // For code that doesn't care about the ABI, we allow returning more than two // integer values in registers. + CCIfType<[v1i1], CCPromoteToType>, CCIfType<[i1], CCPromoteToType>, CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>, CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, @@ -375,6 +376,7 @@ CCIfSwiftError>>, // For integers, ECX, R8D can be used as extra return registers. + CCIfType<[v1i1], CCPromoteToType>, CCIfType<[i1], CCPromoteToType>, CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>, CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>, @@ -485,8 +487,8 @@ // Handles byval parameters. CCIfByVal>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType>, // The 'nest' parameter, if any, is passed in R10. CCIfNest>>, @@ -584,8 +586,8 @@ // FIXME: Handle byval stuff. // FIXME: Handle varargs. - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType>, // The 'nest' parameter, if any, is passed in R10. CCIfNest>, @@ -796,8 +798,8 @@ ]>; def CC_X86_32_C : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType>, // The 'nest' parameter, if any, is passed in ECX. CCIfNest>, @@ -816,8 +818,8 @@ // puts arguments in registers. CCIfByVal>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType>, // If the call is not a vararg call, some arguments may be passed // in integer registers. @@ -828,8 +830,8 @@ ]>; def CC_X86_32_FastCall : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType>, // The 'nest' parameter, if any, is passed in EAX. CCIfNest>, @@ -858,15 +860,15 @@ ]>; def CC_X86_32_ThisCall_Mingw : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType>, CCDelegateTo ]>; def CC_X86_32_ThisCall_Win : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType>, // Pass sret arguments indirectly through stack. 
CCIfSRet>, @@ -885,8 +887,8 @@ // puts arguments in registers. CCIfByVal>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType>, // The 'nest' parameter, if any, is passed in EAX. CCIfNest>, Index: lib/Target/X86/X86FastISel.cpp =================================================================== --- lib/Target/X86/X86FastISel.cpp +++ lib/Target/X86/X86FastISel.cpp @@ -3685,13 +3685,6 @@ switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); case MVT::i1: - if (Subtarget->hasAVX512()) { - // Need to copy to a VK1 register. - unsigned ResultReg = createResultReg(&X86::VK1RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg).addReg(SrcReg); - return ResultReg; - } case MVT::i8: return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, X86::sub_8bit); Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1128,7 +1128,7 @@ addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); - addRegisterClass(MVT::i1, &X86::VK1RegClass); + addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); @@ -1143,16 +1143,6 @@ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); } - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::SETCC, MVT::i1, Custom); - setOperationAction(ISD::SETCCE, MVT::i1, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::XOR, MVT::i1, Legal); - setOperationAction(ISD::OR, MVT::i1, Legal); - setOperationAction(ISD::AND, MVT::i1, Legal); - setOperationAction(ISD::SUB, MVT::i1, Custom); - setOperationAction(ISD::ADD, MVT::i1, Custom); - setOperationAction(ISD::MUL, MVT::i1, Custom); for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16, MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32, @@ -1221,7 +1211,6 @@ setOperationAction(ISD::MSTORE, VT, Custom); } } - setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); @@ -1299,7 +1288,9 @@ setOperationAction(ISD::MUL, MVT::v8i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); @@ -1689,7 +1680,7 @@ LLVMContext& Context, EVT VT) const { if (!VT.isVector()) - return Subtarget.hasAVX512() ? 
MVT::i1: MVT::i8;
+    return MVT::i8;
 
   if (VT.isSimple()) {
     MVT VVT = VT.getSimpleVT();
@@ -2468,6 +2459,9 @@
                                SelectionDAG &DAG) {
   SDValue ValReturned = ValArg;
 
+  if (ValVT == MVT::v1i1)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
+
   if (ValVT == MVT::v64i1) {
     // In 32 bit machine, this case is handled by getv64i1Argument
     assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
@@ -2490,7 +2484,6 @@
       ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
   }
-
   return DAG.getBitcast(ValVT, ValReturned);
 }
 
@@ -2796,8 +2789,11 @@
   SDValue Val = DAG.getLoad(
       ValVT, dl, Chain, FIN,
       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
-  return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
-                       : Val;
+  return ExtendedInMem
+             ? (VA.getValVT().isVector()
+                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
+                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
+             : Val;
 }
 
 // FIXME: Get this from tablegen.
@@ -2947,7 +2943,7 @@
         RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
       else if (RegVT == MVT::x86mmx)
         RC = &X86::VR64RegClass;
-      else if (RegVT == MVT::i1)
+      else if (RegVT == MVT::v1i1)
         RC = &X86::VK1RegClass;
       else if (RegVT == MVT::v8i1)
         RC = &X86::VK8RegClass;
@@ -6858,7 +6854,7 @@
   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
     SDValue In = Op.getOperand(idx);
     if (!In.isUndef())
-      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
   }
   SDLoc dl(Op);
   MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
@@ -6901,7 +6897,7 @@
     if (!isa<ConstantSDNode>(In))
       NonConstIdx.push_back(idx);
     else {
-      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
       HasConstElts = true;
     }
     if (SplatIdx < 0)
@@ -13907,7 +13903,6 @@
   SDValue Idx = Op.getOperand(1);
   MVT EltVT = Op.getSimpleValueType();
-  assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
          "Unexpected vector type in ExtractBitFromMaskVector");
@@ -13941,8 +13936,8 @@
                       DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                       DAG.getConstant(MaxSift, dl, MVT::i8));
-    return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
-                       DAG.getIntPtrConstant(0, dl));
+    return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
+                       DAG.getIntPtrConstant(0, dl));
 }
 
 SDValue
@@ -13953,7 +13948,7 @@
   MVT VecVT = Vec.getSimpleValueType();
   SDValue Idx = Op.getOperand(1);
 
-  if (Op.getSimpleValueType() == MVT::i1)
+  if (VecVT.getVectorElementType() == MVT::i1)
     return ExtractBitFromMaskVector(Op, DAG);
 
   if (!isa<ConstantSDNode>(Idx)) {
@@ -14124,10 +14119,13 @@
     return EltInVec;
   }
 
-  // Insertion of one bit into first or last position
-  // can be done with two SHIFTs + OR.
+  // Insertion of one bit into first position
   if (IdxVal == 0 ) {
-    // EltInVec already at correct index and other bits are 0.
+    // Clean top bits of vector.
+    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
+                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
+    EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
+                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    // Clean the first bit in source vector.
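+    // (Overall sequence, using v8i1 as an illustration: the two shifts above
+    //  zero every bit of EltInVec except bit 0, the shift pair below clears
+    //  bit 0 of Vec, and the final OR merges the inserted bit in.)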
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); @@ -14136,6 +14134,7 @@ return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } + // Insertion of one bit into last position if (IdxVal == NumElems -1) { // Move the bit to the last position inside the vector. EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, @@ -17284,8 +17283,7 @@ if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); - assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) - && "SetCC type must be 8-bit or 1-bit integer"); + assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); @@ -17419,7 +17417,7 @@ if (SSECC != 8) { if (Subtarget.hasAVX512()) { - SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0, + SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); @@ -17467,9 +17465,10 @@ } // AVX512 fallback is to lower selects of scalar floats to masked moves. - if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) && - Subtarget.hasAVX512()) - return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2); + if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { + SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); + return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); + } if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { SDValue Op1Scalar; @@ -19012,8 +19011,8 @@ /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). -/// The mask is coming as MVT::i8 and it should be truncated -/// to MVT::i1 while lowering masking intrinsics. +/// The mask is coming as MVT::i8 and it should be transformed +/// to MVT::v1i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using /// "X86select" instead of "vselect". We just can't create the "vselect" node /// for a scalar instruction. 
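For illustration, a minimal sketch of the node shape this now produces, using the operand names from getScalarMaskingNode (Mask arrives as MVT::i8, Op is the scalar operation, VT its result type); this is not part of the patch itself:

    // Move the mask bit into a v1i1 value (a VK1 mask register) instead of
    // truncating it to a scalar i1, then select between the masked op and
    // the preserved source. X86selects now requires a v1i1 selector; see the
    // SDTCisVT<1, v1i1> profile change in X86InstrFragmentsSIMD.td below.
    SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
    return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);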
@@ -19026,11 +19025,10 @@ MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - // The mask should be of type MVT::i1 - SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask); if (Op.getOpcode() == X86ISD::FSETCCM || - Op.getOpcode() == X86ISD::FSETCCM_RND) + Op.getOpcode() == X86ISD::FSETCCM_RND) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); if (Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::OR, dl, VT, Op, IMask); @@ -19469,10 +19467,11 @@ SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); - SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask, + DAG.getIntPtrConstant(0, dl)); } case CMP_MASK: case CMP_MASK_CC: { @@ -19532,18 +19531,18 @@ if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) - Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); } //default rounding mode if(!Cmp.getNode()) - Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); + Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask, + DAG.getIntPtrConstant(0, dl)); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; @@ -19591,13 +19590,13 @@ SDValue FCmp; if (isRoundModeCurDirection(Sae)) - FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8)); + FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8)); else - FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8), Sae); - // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg" - return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp); + FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8), Sae); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp, + DAG.getIntPtrConstant(0, dl)); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), @@ -23340,8 +23339,6 @@ assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"); - EVT EltVT = NVT.getVectorElementType(); - SDLoc dl(InOp); if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) { @@ -23359,6 +23356,8 @@ for (unsigned i = 0; i < InNumElts; ++i) Ops.push_back(InOp.getOperand(i)); + EVT EltVT = InOp.getOperand(0).getValueType(); + SDValue FillVal = FillWithZeroes ? 
DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
   for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
@@ -29525,7 +29524,7 @@
   if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
       CondVT.getVectorElementType() == MVT::i1) {
     // Invert the cond to not(cond) : xor(op,allones)=not(op)
-    SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+    SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
                                   DAG.getAllOnesConstant(DL, CondVT));
     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
     return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
@@ -31272,13 +31271,11 @@
       // See X86ATTInstPrinter.cpp:printSSECC().
       unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
       if (Subtarget.hasAVX512()) {
-        SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
-                                     CMP01,
-                                     DAG.getConstant(x86cc, DL, MVT::i8));
-        if (N->getValueType(0) != MVT::i1)
-          return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
-                             FSetCC);
-        return FSetCC;
+        SDValue FSetCC =
+            DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
+                        DAG.getConstant(x86cc, DL, MVT::i8));
+        return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
+                           FSetCC, DAG.getIntPtrConstant(0, DL));
       }
       SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
                                           CMP00.getValueType(), CMP00, CMP01,
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -31,8 +31,7 @@
   RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
 
   // The mask VT.
-  ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1",
-                                        "v" # NumElts # "i1"));
+  ValueType KVT = !cast<ValueType>("v" # NumElts # "i1");
 
   // Suffix used in the instruction mnemonic.
   string Suffix = suffix;
@@ -2263,7 +2262,7 @@
 let Predicates = [HasAVX512] in {
   def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
             (KMOVWmk addr:$dst, VK16:$src)>;
-  def : Pat<(i1 (load addr:$src)),
+  def : Pat<(v1i1 (load addr:$src)),
             (COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>;
   def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
             (KMOVWkm addr:$src)>;
@@ -2280,77 +2279,45 @@
 }
 
 let Predicates = [HasAVX512] in {
-  def : Pat<(i1 (trunc (i64 GR64:$src))),
-            (COPY_TO_REGCLASS (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
-                                        (i32 1)), VK1)>;
+  multiclass operation_gpr_mask_copy_lowering<RegisterClass maskRC,
+                                              ValueType maskVT> {
+    def : Pat<(maskVT (scalar_to_vector GR32:$src)),
+              (COPY_TO_REGCLASS GR32:$src, maskRC)>;
 
-  def : Pat<(i1 (trunc (i32 GR32:$src))),
-            (COPY_TO_REGCLASS (AND32ri8 $src, (i32 1)), VK1)>;
+    def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))),
+              (COPY_TO_REGCLASS maskRC:$src, GR32)>;
 
-  def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
-            (COPY_TO_REGCLASS GR32:$src, VK1)>;
+    def : Pat<(maskVT (scalar_to_vector GR8:$src)),
+              (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>;
 
-  def : Pat<(i1 (trunc (i8 GR8:$src))),
-            (COPY_TO_REGCLASS
-             (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
-                                      GR8:$src, sub_8bit), (i32 1)), VK1)>;
+    def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))),
+              (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>;
 
-  def : Pat<(i1 (trunc (i16 GR16:$src))),
-            (COPY_TO_REGCLASS
-             (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
-                                      GR16:$src, sub_16bit), (i32 1)), VK1)>;
-
-  def : Pat<(i32 (zext VK1:$src)),
-            (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1))>;
-
-  def : Pat<(i32 (anyext VK1:$src)),
-            (COPY_TO_REGCLASS VK1:$src, GR32)>;
-
-  def : Pat<(i8 (zext VK1:$src)),
-            (EXTRACT_SUBREG
-             (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_8bit)>;
-
-  def : Pat<(i8 (anyext VK1:$src)),
-            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>;
+    def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))),
+              (COPY_TO_REGCLASS maskRC:$src, GR32)>;
+  }
 
-  def : Pat<(i64 (zext VK1:$src)),
-            (SUBREG_TO_REG (i64 0),
-             (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_32bit)>;
+  defm : operation_gpr_mask_copy_lowering<VK1, v1i1>;
+  defm : operation_gpr_mask_copy_lowering<VK2, v2i1>;
+  defm : operation_gpr_mask_copy_lowering<VK4, v4i1>;
+  defm : operation_gpr_mask_copy_lowering<VK8, v8i1>;
+  defm : operation_gpr_mask_copy_lowering<VK16, v16i1>;
+  defm : operation_gpr_mask_copy_lowering<VK32, v32i1>;
+  defm : operation_gpr_mask_copy_lowering<VK64, v64i1>;
 
-  def : Pat<(i64 (anyext VK1:$src)),
-            (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-             (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_32bit)>;
+  def : Pat<(X86kshiftr (X86kshiftl (v1i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
+            (COPY_TO_REGCLASS
+             (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                               GR8:$src, sub_8bit), (i32 1))), VK1)>;
+  def : Pat<(X86kshiftr (X86kshiftl (v16i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
+            (COPY_TO_REGCLASS
+             (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                               GR8:$src, sub_8bit), (i32 1))), VK16)>;
+  def : Pat<(X86kshiftr (X86kshiftl (v8i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
+            (COPY_TO_REGCLASS
+             (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                               GR8:$src, sub_8bit), (i32 1))), VK8)>;
 
-  def : Pat<(i16 (zext VK1:$src)),
-            (EXTRACT_SUBREG
-             (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_16bit)>;
-
-  def : Pat<(i16 (anyext VK1:$src)),
-            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>;
-}
-def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
-          (COPY_TO_REGCLASS VK1:$src, VK16)>;
-def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
-          (COPY_TO_REGCLASS VK1:$src, VK8)>;
-def : Pat<(v4i1 (scalar_to_vector VK1:$src)),
-          (COPY_TO_REGCLASS VK1:$src, VK4)>;
-def : Pat<(v2i1 (scalar_to_vector VK1:$src)),
-          (COPY_TO_REGCLASS VK1:$src, VK2)>;
-def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
-          (COPY_TO_REGCLASS VK1:$src, VK32)>;
-def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
-          (COPY_TO_REGCLASS VK1:$src, VK64)>;
-
-def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
-
-def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), (COPY_TO_REGCLASS VK64:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))), (COPY_TO_REGCLASS VK4:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))), (COPY_TO_REGCLASS VK2:$src, VK1)>;
+}
 
 // Mask unary operation
 // - KNOT
@@ -2551,14 +2518,11 @@
   def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
   def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>;
   def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>;
+  def : Pat<(v1i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK1)>;
   def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
   def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>;
   def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>;
-  let AddedComplexity = 10 in { // To optimize isel table.
- def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>; - def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; - def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; - } + def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } // Patterns for kmask insert_subvector/extract_subvector to/from index=0 @@ -2570,6 +2534,12 @@ def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))), (VT (COPY_TO_REGCLASS subRC:$src, RC))>; } +defm : operation_subvector_mask_lowering; +defm : operation_subvector_mask_lowering; +defm : operation_subvector_mask_lowering; +defm : operation_subvector_mask_lowering; +defm : operation_subvector_mask_lowering; +defm : operation_subvector_mask_lowering; defm : operation_subvector_mask_lowering; defm : operation_subvector_mask_lowering; @@ -3249,7 +3219,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0, (_.VT (scalar_to_vector - (_.EltVT (X86selects (i1 (trunc GR32:$mask)), + (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))), (_.EltVT _.FRC:$src1), (_.EltVT _.FRC:$src2))))))), (COPY_TO_REGCLASS (!cast(InstrStr#rrk) @@ -3260,7 +3230,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0, (_.VT (scalar_to_vector - (_.EltVT (X86selects (i1 (trunc GR32:$mask)), + (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))), (_.EltVT _.FRC:$src1), (_.EltVT ZeroFP))))))), (COPY_TO_REGCLASS (!cast(InstrStr#rrkz) @@ -3279,7 +3249,7 @@ (iPTR 0))), (iPTR 0)))), (!cast(InstrStr#mrk) addr:$dst, - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; } @@ -3296,7 +3266,7 @@ (iPTR 0))), (iPTR 0)))), (!cast(InstrStr#mrk) addr:$dst, - (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; } @@ -3310,7 +3280,7 @@ (v16i32 immAllZerosV))))), (iPTR 0))), (!cast(InstrStr#rmkz) - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), addr:$srcAddr)>; def : Pat<(_.info128.VT (extract_subvector @@ -3322,7 +3292,7 @@ (iPTR 0))))), (iPTR 0))), (!cast(InstrStr#rmk) _.info128.RC:$src, - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), addr:$srcAddr)>; } @@ -3338,7 +3308,7 @@ (v16i32 immAllZerosV))))), (iPTR 0))), (!cast(InstrStr#rmkz) - (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), addr:$srcAddr)>; def : Pat<(_.info128.VT (extract_subvector @@ -3350,7 +3320,7 @@ (iPTR 0))))), (iPTR 0))), (!cast(InstrStr#rmk) _.info128.RC:$src, - (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), addr:$srcAddr)>; } @@ -3381,7 +3351,7 @@ VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), - (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM)), + (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; let hasSideEffects = 0 in Index: lib/Target/X86/X86InstrFragmentsSIMD.td 
=================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -274,7 +274,7 @@ SDTCisSameNumEltsAs<0, 1>]>>; def X86selects : SDNode<"X86ISD::SELECTS", - SDTypeProfile<1, 3, [SDTCisVT<1, i1>, + SDTypeProfile<1, 3, [SDTCisVT<1, v1i1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>>; @@ -441,7 +441,7 @@ SDTCisSameNumEltsAs<0,1>, SDTCisVT<2, i32>]>, []>; def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", - SDTypeProfile<1, 2, [SDTCisVT<0, i1>, + SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>, SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", @@ -451,7 +451,7 @@ def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, - [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, + [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; Index: lib/Target/X86/X86RegisterInfo.td =================================================================== --- lib/Target/X86/X86RegisterInfo.td +++ lib/Target/X86/X86RegisterInfo.td @@ -511,7 +511,7 @@ 256, (sequence "YMM%u", 0, 31)>; // Mask registers -def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} +def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;} def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;} def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;} @@ -519,7 +519,7 @@ def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} -def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;} +def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;} def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;} Index: test/CodeGen/X86/avx512-cmp.ll =================================================================== --- test/CodeGen/X86/avx512-cmp.ll +++ test/CodeGen/X86/avx512-cmp.ll @@ -47,16 +47,20 @@ ret float %c1 } -; FIXME: Can use vcmpeqss and extract from the mask here in AVX512. 
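+; The compare now stays in a mask register: vcmpeqss writes its result to
+; %k0, kmovw (kmovd on SKX) copies the bit to a GPR, and movzbl performs the
+; zero extension of the i1 result to i32.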
define i32 @test3(float %a, float %b) { -; ALL-LABEL: test3: -; ALL: ## BB#0: -; ALL-NEXT: vucomiss %xmm1, %xmm0 -; ALL-NEXT: setnp %al -; ALL-NEXT: sete %cl -; ALL-NEXT: andb %al, %cl -; ALL-NEXT: movzbl %cl, %eax -; ALL-NEXT: retq +; KNL-LABEL: test3: +; KNL: ## BB#0: +; KNL-NEXT: vcmpeqss %xmm1, %xmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test3: +; SKX: ## BB#0: +; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %k0 +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: retq %cmp10.i = fcmp oeq float %a, %b %conv11.i = zext i1 %cmp10.i to i32 @@ -69,7 +73,7 @@ ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ALL-NEXT: vucomiss %xmm1, %xmm0 ; ALL-NEXT: jne LBB3_1 -; ALL-NEXT: jp LBB3_1 +; ALL-NEXT: jp LBB3_1 ; ALL-NEXT: ## BB#2: ## %return ; ALL-NEXT: retq ; ALL-NEXT: LBB3_1: ## %if.end @@ -158,47 +162,22 @@ } define i32 @test10(i64 %b, i64 %c, i1 %d) { -; KNL-LABEL: test10: -; KNL: ## BB#0: -; KNL-NEXT: andl $1, %edx -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: cmpq %rsi, %rdi -; KNL-NEXT: sete %al -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: korw %k1, %k0, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB8_1 -; KNL-NEXT: ## BB#2: ## %if.end.i -; KNL-NEXT: movl $6, %eax -; KNL-NEXT: retq -; KNL-NEXT: LBB8_1: ## %if.then.i -; KNL-NEXT: movl $5, %eax -; KNL-NEXT: retq -; -; SKX-LABEL: test10: -; SKX: ## BB#0: -; SKX-NEXT: andl $1, %edx -; SKX-NEXT: kmovd %edx, %k0 -; SKX-NEXT: cmpq %rsi, %rdi -; SKX-NEXT: sete %al -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: korw %k1, %k0, %k1 -; SKX-NEXT: kxorw %k1, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: testb %al, %al -; SKX-NEXT: je LBB8_1 -; SKX-NEXT: ## BB#2: ## %if.end.i -; SKX-NEXT: movl $6, %eax -; SKX-NEXT: retq -; SKX-NEXT: LBB8_1: ## %if.then.i -; SKX-NEXT: movl $5, %eax -; SKX-NEXT: retq +; ALL-LABEL: test10: +; ALL: ## BB#0: +; ALL-NEXT: movl %edx, %eax +; ALL-NEXT: andb $1, %al +; ALL-NEXT: cmpq %rsi, %rdi +; ALL-NEXT: sete %cl +; ALL-NEXT: orb %dl, %cl +; ALL-NEXT: andb $1, %cl +; ALL-NEXT: cmpb %cl, %al +; ALL-NEXT: je LBB8_1 +; ALL-NEXT: ## BB#2: ## %if.end.i +; ALL-NEXT: movl $6, %eax +; ALL-NEXT: retq +; ALL-NEXT: LBB8_1: ## %if.then.i +; ALL-NEXT: movl $5, %eax +; ALL-NEXT: retq %cmp8.i = icmp eq i64 %b, %c %or1 = or i1 %d, %cmp8.i Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -1552,10 +1552,10 @@ ; NOVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; NOVL-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpextrq $1, %xmm0, %rax +; NOVL-NEXT: vpextrb $8, %xmm0, %eax ; NOVL-NEXT: andl $1, %eax ; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 -; NOVL-NEXT: vmovq %xmm0, %rax +; NOVL-NEXT: vpextrb $0, %xmm0, %eax ; NOVL-NEXT: andl $1, %eax ; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 ; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] Index: test/CodeGen/X86/avx512-ext.ll =================================================================== --- test/CodeGen/X86/avx512-ext.ll +++ test/CodeGen/X86/avx512-ext.ll @@ -1434,26 +1434,26 @@ define i16 @trunc_i32_to_i1(i32 %a) { ; KNL-LABEL: trunc_i32_to_i1: ; KNL: ## BB#0: -; KNL-NEXT: andl $1, %edi -; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: movw $-4, %ax -; KNL-NEXT: kmovw 
%eax, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: andl $1, %edi +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_i32_to_i1: ; SKX: ## BB#0: -; SKX-NEXT: andl $1, %edi -; SKX-NEXT: kmovd %edi, %k0 ; SKX-NEXT: movw $-4, %ax -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kshiftrw $1, %k1, %k1 -; SKX-NEXT: kshiftlw $1, %k1, %k1 -; SKX-NEXT: korw %k0, %k1, %k0 +; SKX-NEXT: kmovd %eax, %k0 +; SKX-NEXT: kshiftrw $1, %k0, %k0 +; SKX-NEXT: kshiftlw $1, %k0, %k0 +; SKX-NEXT: andl $1, %edi +; SKX-NEXT: kmovw %edi, %k1 +; SKX-NEXT: korw %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AX %AX %EAX ; SKX-NEXT: retq Index: test/CodeGen/X86/avx512-fsel.ll =================================================================== --- test/CodeGen/X86/avx512-fsel.ll +++ test/CodeGen/X86/avx512-fsel.ll @@ -10,25 +10,11 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: Lcfi0: ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vucomiss %xmm1, %xmm0 -; CHECK-NEXT: setp %al -; CHECK-NEXT: setne %cl -; CHECK-NEXT: setnp %dl -; CHECK-NEXT: sete %sil -; CHECK-NEXT: andb %dl, %sil -; CHECK-NEXT: ## implicit-def: %EDI -; CHECK-NEXT: movb %sil, %dil -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: orb %al, %cl -; CHECK-NEXT: ## implicit-def: %EDI -; CHECK-NEXT: movb %cl, %dil -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: kmovw %k1, %edi -; CHECK-NEXT: movb %dil, %al -; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK-NEXT: vcmpeqss %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: movb %al, %cl +; CHECK-NEXT: xorb $-1, %cl +; CHECK-NEXT: testb $1, %cl ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: jmp LBB0_2 ; CHECK-NEXT: LBB0_1: ## %L_0 Index: test/CodeGen/X86/avx512-i1test.ll =================================================================== --- test/CodeGen/X86/avx512-i1test.ll +++ test/CodeGen/X86/avx512-i1test.ll @@ -66,14 +66,13 @@ define i64 @func2(i1 zeroext %i, i32 %j) { ; CHECK-LABEL: func2: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: # kill: %EDI %EDI %RDI ; CHECK-NEXT: testl %esi, %esi ; CHECK-NEXT: je .LBB1_1 ; CHECK-NEXT: # BB#2: # %if.then ; CHECK-NEXT: jmp bar # TAILCALL ; CHECK-NEXT: .LBB1_1: # %return -; CHECK-NEXT: orq $-2, %rdi -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: orq $-2, %rax ; CHECK-NEXT: retq entry: %tobool = icmp eq i32 %j, 0 Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -260,8 +260,7 @@ ; KNL-NEXT: kshiftlw $11, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: testb %al, %al +; KNL-NEXT: testb $1, %al ; KNL-NEXT: je LBB10_2 ; KNL-NEXT: ## BB#1: ## %A ; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -276,8 +275,7 @@ ; SKX-NEXT: kshiftlw $11, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: testb %al, %al +; SKX-NEXT: testb $1, %al ; SKX-NEXT: je LBB10_2 ; SKX-NEXT: ## BB#1: ## %A ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -299,13 +297,10 @@ ; KNL-LABEL: test12: ; KNL: ## BB#0: ; KNL-NEXT: vpcmpgtq 
%zmm0, %zmm2, %k0 -; KNL-NEXT: vpcmpgtq %zmm1, %zmm3, %k1 -; KNL-NEXT: kunpckbw %k0, %k1, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: testb %al, %al +; KNL-NEXT: testb $1, %al ; KNL-NEXT: cmoveq %rsi, %rdi ; KNL-NEXT: movq %rdi, %rax ; KNL-NEXT: retq @@ -313,13 +308,10 @@ ; SKX-LABEL: test12: ; SKX: ## BB#0: ; SKX-NEXT: vpcmpgtq %zmm0, %zmm2, %k0 -; SKX-NEXT: vpcmpgtq %zmm1, %zmm3, %k1 -; SKX-NEXT: kunpckbw %k0, %k1, %k0 -; SKX-NEXT: kshiftlw $15, %k0, %k0 -; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: testb %al, %al +; SKX-NEXT: testb $1, %al ; SKX-NEXT: cmoveq %rsi, %rdi ; SKX-NEXT: movq %rdi, %rax ; SKX-NEXT: vzeroupper @@ -335,13 +327,13 @@ ; KNL: ## BB#0: ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al +; KNL-NEXT: movw $-4, %cx +; KNL-NEXT: kmovw %ecx, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k0 -; KNL-NEXT: movw $-4, %ax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX ; KNL-NEXT: retq @@ -350,13 +342,13 @@ ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al +; SKX-NEXT: movw $-4, %cx +; SKX-NEXT: kmovd %ecx, %k0 +; SKX-NEXT: kshiftrw $1, %k0, %k0 +; SKX-NEXT: kshiftlw $1, %k0, %k0 ; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: movw $-4, %ax -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kshiftrw $1, %k1, %k1 -; SKX-NEXT: kshiftlw $1, %k1, %k1 -; SKX-NEXT: korw %k0, %k1, %k0 +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: korw %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AX %AX %EAX ; SKX-NEXT: retq @@ -373,8 +365,7 @@ ; KNL-NEXT: kshiftlw $11, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: testb %al, %al +; KNL-NEXT: testb $1, %al ; KNL-NEXT: cmoveq %rsi, %rdi ; KNL-NEXT: movq %rdi, %rax ; KNL-NEXT: retq @@ -385,8 +376,7 @@ ; SKX-NEXT: kshiftlb $3, %k0, %k0 ; SKX-NEXT: kshiftrb $7, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: testb %al, %al +; SKX-NEXT: testb $1, %al ; SKX-NEXT: cmoveq %rsi, %rdi ; SKX-NEXT: movq %rdi, %rax ; SKX-NEXT: vzeroupper @@ -424,14 +414,13 @@ define i16 @test16(i1 *%addr, i16 %a) { ; KNL-LABEL: test16: ; KNL: ## BB#0: -; KNL-NEXT: movzbl (%rdi), %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kmovw %esi, %k2 +; KNL-NEXT: movb (%rdi), %al +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15] -; KNL-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; KNL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; KNL-NEXT: vpslld $31, %zmm2, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -440,14 +429,13 @@ ; ; SKX-LABEL: test16: ; SKX: ## BB#0: -; SKX-NEXT: movzbl (%rdi), %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: kmovd %esi, %k1 +; SKX-NEXT: movb (%rdi), %al +; SKX-NEXT: kmovd %esi, %k0 +; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vpmovm2d %k1, %zmm0 ; SKX-NEXT: vpmovm2d %k0, %zmm1 ; SKX-NEXT: vmovdqa32 
{{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15] -; SKX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; SKX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; SKX-NEXT: vpmovd2m %zmm2, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AX %AX %EAX @@ -463,14 +451,13 @@ define i8 @test17(i1 *%addr, i8 %a) { ; KNL-LABEL: test17: ; KNL: ## BB#0: -; KNL-NEXT: movzbl (%rdi), %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kmovw %esi, %k2 +; KNL-NEXT: movb (%rdi), %al +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] -; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -479,14 +466,13 @@ ; ; SKX-LABEL: test17: ; SKX: ## BB#0: -; SKX-NEXT: movzbl (%rdi), %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: kmovd %esi, %k1 +; SKX-NEXT: movb (%rdi), %al +; SKX-NEXT: kmovd %esi, %k0 +; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vpmovm2q %k1, %zmm0 ; SKX-NEXT: vpmovm2q %k0, %zmm1 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] -; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; SKX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; SKX-NEXT: vpmovq2m %zmm2, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AL %AL %EAX @@ -1283,12 +1269,11 @@ ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al -; SKX-NEXT: andl $1, %eax +; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0 +; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1 +; SKX-NEXT: kunpckwd %k0, %k1, %k0 +; SKX-NEXT: vpmovm2w %k0, %zmm0 ; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k1 -; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k2 -; SKX-NEXT: kunpckwd %k1, %k2, %k1 -; SKX-NEXT: vpmovm2w %k1, %zmm0 ; SKX-NEXT: vpmovm2w %k0, %zmm1 ; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 @@ -1308,33 +1293,29 @@ ; KNL: ## BB#0: ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpextrd $1, %xmm0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; KNL-NEXT: vmovd %xmm0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vpextrb $4, %xmm0, %ecx +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpextrb $0, %xmm0, %ecx +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] ; KNL-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 -; KNL-NEXT: vptestmq %zmm1, %zmm1, %k2 -; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] ; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 ; KNL-NEXT: vptestmq %zmm1, 
%zmm1, %k1 ; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; KNL-NEXT: vpextrd $3, %xmm0, %eax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vpextrb $12, %xmm0, %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] @@ -1349,10 +1330,9 @@ ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al -; SKX-NEXT: andl $1, %eax +; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 ; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k1 -; SKX-NEXT: vpmovm2d %k1, %xmm0 ; SKX-NEXT: vpmovm2d %k0, %xmm1 ; SKX-NEXT: vpbroadcastq %xmm1, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] @@ -1373,16 +1353,14 @@ ; KNL: ## BB#0: ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpextrb $0, %xmm0, %ecx +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] ; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -1396,13 +1374,12 @@ ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftlw $1, %k1, %k1 -; SKX-NEXT: kshiftrw $1, %k1, %k1 ; SKX-NEXT: kshiftlw $1, %k0, %k0 -; SKX-NEXT: korw %k0, %k1, %k0 +; SKX-NEXT: kshiftrw $1, %k0, %k0 +; SKX-NEXT: korw %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AL %AL %EAX ; SKX-NEXT: retq @@ -1422,8 +1399,10 @@ ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpextrb $0, %xmm0, %eax -; KNL-NEXT: addb $4, %al -; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_v2i1: @@ -1432,11 +1411,10 @@ ; SKX-NEXT: kshiftlw $15, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: cmpb $1, %al -; SKX-NEXT: movb $3, %al -; SKX-NEXT: adcb $0, %al -; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: retq %t1 = icmp ugt <2 x i64> %a, %b %t2 = extractelement <2 x i1> %t1, i32 0 @@ -1452,8 +1430,10 @@ ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpextrb $0, %xmm0, %eax -; KNL-NEXT: addb $4, %al -; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: extractelement_v2i1_alt: @@ -1462,11 +1442,10 @@ ; SKX-NEXT: kshiftlw $15, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: cmpb $1, %al -; SKX-NEXT: movb $3, %al -; SKX-NEXT: adcb $0, %al -; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb 
$4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: retq %t1 = icmp ugt <2 x i64> %a, %b %t2 = extractelement <2 x i1> %t1, i32 0 @@ -1535,8 +1514,10 @@ ; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpextrb $15, %xmm0, %eax -; KNL-NEXT: addb $4, %al -; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_v64i1: @@ -1544,11 +1525,10 @@ ; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 ; SKX-NEXT: kshiftrq $63, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: cmpb $1, %al -; SKX-NEXT: movb $3, %al -; SKX-NEXT: adcb $0, %al -; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t1 = icmp ugt <64 x i8> %a, %b @@ -1566,8 +1546,10 @@ ; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpextrb $15, %xmm0, %eax -; KNL-NEXT: addb $4, %al -; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: extractelement_v64i1_alt: @@ -1575,11 +1557,10 @@ ; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 ; SKX-NEXT: kshiftrq $63, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: cmpb $1, %al -; SKX-NEXT: movb $3, %al -; SKX-NEXT: adcb $0, %al -; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t1 = icmp ugt <64 x i8> %a, %b @@ -2332,7 +2313,7 @@ ; SKX-NEXT: vpmovm2q %k0, %xmm0 ; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; SKX-NEXT: andl $1, %edi -; SKX-NEXT: movl -24(%rsp,%rdi,8), %eax +; SKX-NEXT: movzbl -24(%rsp,%rdi,8), %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: retq %t1 = icmp ugt <2 x i64> %a, %b @@ -2362,7 +2343,7 @@ ; SKX-NEXT: vpmovm2d %k0, %xmm0 ; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; SKX-NEXT: andl $3, %edi -; SKX-NEXT: movl -24(%rsp,%rdi,4), %eax +; SKX-NEXT: movzbl -24(%rsp,%rdi,4), %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: retq %t1 = icmp ugt <4 x i32> %a, %b @@ -2391,7 +2372,7 @@ ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa64 %zmm0, (%rsp) ; KNL-NEXT: andl $7, %edi -; KNL-NEXT: movl (%rsp,%rdi,8), %eax +; KNL-NEXT: movzbl (%rsp,%rdi,8), %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp @@ -2414,7 +2395,7 @@ ; SKX-NEXT: vpmovm2q %k0, %zmm0 ; SKX-NEXT: vmovdqa64 %zmm0, (%rsp) ; SKX-NEXT: andl $7, %edi -; SKX-NEXT: movl (%rsp,%rdi,8), %eax +; SKX-NEXT: movzbl (%rsp,%rdi,8), %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp @@ -2444,7 +2425,7 @@ ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa32 %zmm0, (%rsp) ; KNL-NEXT: andl $15, %edi -; KNL-NEXT: movl (%rsp,%rdi,4), %eax +; KNL-NEXT: movzbl (%rsp,%rdi,4), %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp @@ -2467,7 +2448,7 @@ ; SKX-NEXT: vpmovm2d %k0, %zmm0 ; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) ; SKX-NEXT: andl $15, %edi -; SKX-NEXT: movl (%rsp,%rdi,4), %eax +; SKX-NEXT: movzbl (%rsp,%rdi,4), %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp @@ -2500,9 +2481,8 @@ ; KNL-NEXT: vmovdqa %ymm0, (%rsp) ; KNL-NEXT: andl 
$31, %edi ; KNL-NEXT: movq %rsp, %rax -; KNL-NEXT: movb (%rdi,%rax), %al -; KNL-NEXT: andb $1, %al -; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: movzbl (%rdi,%rax), %eax +; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp ; KNL-NEXT: retq @@ -2524,7 +2504,7 @@ ; SKX-NEXT: vpmovm2w %k0, %zmm0 ; SKX-NEXT: vmovdqu16 %zmm0, (%rsp) ; SKX-NEXT: andl $31, %edi -; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax +; SKX-NEXT: movzbl (%rsp,%rdi,2), %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp Index: test/CodeGen/X86/avx512-insert-extract_i1.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract_i1.ll +++ test/CodeGen/X86/avx512-insert-extract_i1.ll @@ -22,9 +22,8 @@ ; SKX-NEXT: vmovdqu8 %zmm0, (%rsp) ; SKX-NEXT: andl $63, %edi ; SKX-NEXT: movq %rsp, %rax -; SKX-NEXT: movb (%rdi,%rax), %al -; SKX-NEXT: andb $1, %al -; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: movzbl (%rdi,%rax), %eax +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp ; SKX-NEXT: vzeroupper Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2881,23 +2881,23 @@ ; CHECK-LABEL: test_mask_vextractf32x4: ; CHECK: ## BB#0: ; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: kshiftlw $12, %k1, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kshiftlw $12, %k0, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kshiftlw $13, %k0, %k2 ; CHECK-NEXT: kshiftrw $15, %k2, %k2 -; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftlw $15, %k0, %k3 ; CHECK-NEXT: kshiftrw $15, %k3, %k3 -; CHECK-NEXT: kshiftlw $14, %k1, %k1 -; CHECK-NEXT: kshiftrw $15, %k1, %k1 -; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kshiftlw $14, %k0, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: kmovw %k3, %ecx ; CHECK-NEXT: vmovd %ecx, %xmm2 -; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; CHECK-NEXT: kmovw %k2, %eax -; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq @@ -2911,23 +2911,23 @@ ; CHECK-LABEL: test_mask_vextracti64x4: ; CHECK: ## BB#0: ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: kshiftlw $12, %k1, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kshiftlw $12, %k0, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kshiftlw $13, %k0, %k2 ; CHECK-NEXT: kshiftrw $15, %k2, %k2 -; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftlw $15, %k0, %k3 ; CHECK-NEXT: kshiftrw $15, %k3, %k3 -; CHECK-NEXT: kshiftlw $14, %k1, %k1 -; CHECK-NEXT: kshiftrw $15, %k1, %k1 -; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kshiftlw $14, %k0, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: kmovw %k3, %ecx ; CHECK-NEXT: vmovd %ecx, %xmm2 -; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $4, %eax, 
%xmm2, %xmm2 ; CHECK-NEXT: kmovw %k2, %eax -; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2 ; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 @@ -2942,23 +2942,23 @@ ; CHECK-LABEL: test_maskz_vextracti32x4: ; CHECK: ## BB#0: ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: kshiftlw $12, %k1, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kshiftlw $12, %k0, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kshiftlw $13, %k0, %k2 ; CHECK-NEXT: kshiftrw $15, %k2, %k2 -; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftlw $15, %k0, %k3 ; CHECK-NEXT: kshiftrw $15, %k3, %k3 -; CHECK-NEXT: kshiftlw $14, %k1, %k1 -; CHECK-NEXT: kshiftrw $15, %k1, %k1 -; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kshiftlw $14, %k0, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: kmovw %k3, %ecx ; CHECK-NEXT: vmovd %ecx, %xmm1 -; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; CHECK-NEXT: kmovw %k2, %eax -; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 ; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1 ; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 Index: test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics.ll +++ test/CodeGen/X86/avx512-intrinsics.ll @@ -121,6 +121,8 @@ ; CHECK-NEXT: kmovw %eax, %k2 ; CHECK-NEXT: kxorw %k0, %k1, %k0 ; CHECK-NEXT: kxorw %k0, %k2, %k0 +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq @@ -269,7 +271,6 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_sqrt_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1} @@ -296,7 +297,6 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_sqrt_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1} @@ -2214,7 +2214,6 @@ define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_rn: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2226,7 +2225,6 @@ define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_rd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2238,7 +2236,6 @@ define <4 x float> 
@test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_ru: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2250,7 +2247,6 @@ define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_rz: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2262,7 +2258,6 @@ define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_current: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2274,7 +2269,6 @@ define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_add_ss_rn: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2294,7 +2288,6 @@ define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_current_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 @@ -2311,7 +2304,6 @@ define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, float* %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_add_ss_current_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2329,7 +2321,6 @@ define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_sd_rn: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2341,7 +2332,6 @@ define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_sd_rd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2353,7 +2343,6 @@ define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_sd_ru: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2365,7 +2354,6 @@ define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_sd_rz: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2377,7 +2365,6 @@ define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_sd_current: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} ; 
CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2389,7 +2376,6 @@ define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_add_sd_rn: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2409,7 +2395,6 @@ define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_sd_current_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 @@ -2424,7 +2409,6 @@ define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, double* %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_add_sd_current_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2440,7 +2424,6 @@ define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_ss_sae: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2452,7 +2435,6 @@ define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_ss_sae: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2472,7 +2454,6 @@ define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2484,7 +2465,6 @@ define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2504,7 +2484,6 @@ define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_ss_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 @@ -2521,7 +2500,6 @@ define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, float* %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_ss_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2538,7 +2516,6 @@ define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_sd_sae: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2550,7 +2527,6 @@ define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_sd_sae: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2570,7 +2546,6 @@ define <2 x double> 
@test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2582,7 +2557,6 @@ define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2602,7 +2576,6 @@ define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_sd_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 @@ -2617,7 +2590,6 @@ define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_sd_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -3651,16 +3623,15 @@ define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_getexp_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1} -; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} -; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm5, %xmm4, %xmm1 +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8) @@ -3678,16 +3649,15 @@ define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_getexp_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovapd %xmm2, %xmm3 -; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1} -; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 +; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm5 {%k1} {z} ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm3, %xmm5, %xmm1 +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8) @@ -3705,11 +3675,9 @@ define 
i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq @@ -3720,18 +3688,18 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all: ; CHECK: ## BB#0: +; CHECK-NEXT: vcmplesd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %ecx ; CHECK-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0 -; CHECK-NEXT: vcmplesd %xmm1, %xmm0, %k1 -; CHECK-NEXT: korw %k0, %k1, %k0 -; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k1 -; CHECK-NEXT: vcmpneqsd %xmm1, %xmm0, %k2 -; CHECK-NEXT: korw %k1, %k2, %k1 -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k2 -; CHECK-NEXT: kandw %k2, %k1, %k1 -; CHECK-NEXT: korw %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %edx +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %esi +; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orb %cl, %dl +; CHECK-NEXT: orb %sil, %al +; CHECK-NEXT: orb %dl, %al ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq @@ -3751,11 +3719,9 @@ define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq @@ -3767,17 +3733,17 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all: ; CHECK: ## BB#0: -; CHECK-NEXT: vcmpless %xmm1, %xmm0, %k1 -; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: vcmpless %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %ecx +; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %edx ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k2 {%k1} -; CHECK-NEXT: kmovw %k2, %ecx -; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1} -; CHECK-NEXT: kmovw %k1, %edx -; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %esi +; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andb %cl, %al +; CHECK-NEXT: andb %cl, %dl +; CHECK-NEXT: andb %sil, %al ; CHECK-NEXT: andb %dl, %al ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq @@ -3898,15 +3864,14 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovapd %xmm2, %xmm3 -; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1} -; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z} -; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 +; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} +; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 
{%k1} {z} ; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vaddpd %xmm5, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4) @@ -3924,14 +3889,13 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z} -; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4 {%k1} {z} ; CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm1 +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4) @@ -4056,7 +4020,6 @@ define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0 @@ -4073,7 +4036,6 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0 @@ -4596,7 +4558,6 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm3 ; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} @@ -4620,16 +4581,15 @@ define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm3 -; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z} +; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm4 -; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4 +; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 -; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4) 
%res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8) @@ -4690,16 +4650,15 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm0, %xmm3 -; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm0, %xmm4 -; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4 +; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 -; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) @@ -4714,7 +4673,6 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm0, %xmm3 ; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z} @@ -4815,17 +4773,16 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm0, %xmm3 -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm0, %xmm4 -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vmovapd %xmm0, %xmm5 -; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1} -; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm1 -; CHECK-NEXT: vaddpd %xmm5, %xmm0, %xmm0 +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 +; CHECK-NEXT: vaddpd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) @@ -4843,17 +4800,16 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm3 -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm4 -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vmovaps %xmm0, %xmm5 -; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1} -; 
CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm1 -; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm0 +; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 +; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm1 +; CHECK-NEXT: vaddps %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) @@ -4871,7 +4827,6 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm0, %xmm3 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z} @@ -4889,7 +4844,6 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -4903,17 +4857,16 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm3 -; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm4 -; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) @@ -4931,17 +4884,16 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 -; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm4 -; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x 
float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) @@ -4958,7 +4910,6 @@ ; CHECK-LABEL: fmadd_ss_mask_memfold: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: andl $1, %edx ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} ; CHECK-NEXT: vmovss %xmm0, (%rdi) @@ -4986,7 +4937,6 @@ ; CHECK-LABEL: fmadd_ss_maskz_memfold: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: andl $1, %edx ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vmovss %xmm0, (%rdi) @@ -5014,7 +4964,6 @@ ; CHECK-LABEL: fmadd_sd_mask_memfold: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: andl $1, %edx ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1} ; CHECK-NEXT: vmovlpd %xmm0, (%rdi) @@ -5038,7 +4987,6 @@ ; CHECK-LABEL: fmadd_sd_maskz_memfold: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: andl $1, %edx ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vmovlpd %xmm0, (%rdi) @@ -5063,17 +5011,16 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm3 -; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm4 -; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) @@ -5091,17 +5038,16 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 -; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm4 -; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> 
@llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) @@ -5119,17 +5065,16 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm3 -; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm4 -; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) @@ -5147,17 +5092,16 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 -; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 +; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm4 -; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} -; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 +; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) @@ -5173,7 +5117,6 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 @@ -5187,7 +5130,6 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq @@ -5201,7 +5143,8 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm: ; CHECK: ## BB#0: -; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: 
kmovw %eax, %k1 ; CHECK-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %q = load float, float* %ptr_b Index: test/CodeGen/X86/avx512-load-store.ll =================================================================== --- test/CodeGen/X86/avx512-load-store.ll +++ test/CodeGen/X86/avx512-load-store.ll @@ -12,7 +12,7 @@ ; CHECK32-LABEL: test_mm_mask_move_ss: ; CHECK32: # BB#0: # %entry ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al -; CHECK32-NEXT: andl $1, %eax +; CHECK32-NEXT: andb $1, %al ; CHECK32-NEXT: kmovw %eax, %k1 ; CHECK32-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} ; CHECK32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] @@ -37,7 +37,7 @@ ; CHECK32-LABEL: test_mm_maskz_move_ss: ; CHECK32: # BB#0: # %entry ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al -; CHECK32-NEXT: andl $1, %eax +; CHECK32-NEXT: andb $1, %al ; CHECK32-NEXT: kmovw %eax, %k1 ; CHECK32-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK32-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} @@ -62,7 +62,7 @@ ; CHECK32-LABEL: test_mm_mask_move_sd: ; CHECK32: # BB#0: # %entry ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al -; CHECK32-NEXT: andl $1, %eax +; CHECK32-NEXT: andb $1, %al ; CHECK32-NEXT: kmovw %eax, %k1 ; CHECK32-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; CHECK32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1] @@ -87,7 +87,7 @@ ; CHECK32-LABEL: test_mm_maskz_move_sd: ; CHECK32: # BB#0: # %entry ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al -; CHECK32-NEXT: andl $1, %eax +; CHECK32-NEXT: andb $1, %al ; CHECK32-NEXT: kmovw %eax, %k1 ; CHECK32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK32-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} Index: test/CodeGen/X86/avx512-mask-bugfix.ll =================================================================== --- test/CodeGen/X86/avx512-mask-bugfix.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s - -; ModuleID = 'foo.ll' -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: nounwind readnone -declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #0 - -; Function Attrs: nounwind readnone -declare i64 @llvm.cttz.i64(i64, i1) #0 - -; Function Attrs: nounwind -define void @foo(float* noalias %aFOO, float %b, i32 %a) { -allocas: - %full_mask_memory.i57 = alloca <8 x float> - %return_value_memory.i60 = alloca i1 - %cmp.i = icmp eq i32 %a, 65535 - br i1 %cmp.i, label %all_on, label %some_on - -all_on: - %mask0 = load <8 x float>, <8 x float>* %full_mask_memory.i57 - %v0.i.i.i70 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) #0 - %allon.i.i76 = icmp eq i32 %v0.i.i.i70, 65535 - br i1 %allon.i.i76, label %check_neighbors.i.i121, label %domixed.i.i100 - -domixed.i.i100: - br label %check_neighbors.i.i121 - -check_neighbors.i.i121: - %v1.i5.i.i116 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) #0 - %alleq.i.i120 = icmp eq i32 %v1.i5.i.i116, 65535 - br i1 %alleq.i.i120, label %all_equal.i.i123, label %not_all_equal.i.i124 - -; CHECK: kxnorw %k0, %k0, %k0 -; CHECK: kshiftrw $15, %k0, %k0 -; CHECK: jmp -; CHECK: kxorw %k0, %k0, %k0 - -all_equal.i.i123: - br label %reduce_equal___vyi.exit128 - -not_all_equal.i.i124: - br label %reduce_equal___vyi.exit128 - -reduce_equal___vyi.exit128: - %calltmp2.i125 = phi i1 [ true, %all_equal.i.i123 ], [ false, %not_all_equal.i.i124 ] - store i1 %calltmp2.i125, i1* %return_value_memory.i60 - %return_value.i126 = load i1, i1* %return_value_memory.i60 - %. = select i1 %return_value.i126, i32 1, i32 0 - %select_to_float = sitofp i32 %. 
to float - ret void - -some_on: - ret void -} - Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -418,7 +418,7 @@ ; KNL-NEXT: kshiftlw $10, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: andb $1, %al ; KNL-NEXT: ## kill: %AL %AL %EAX ; KNL-NEXT: retq ; @@ -428,7 +428,7 @@ ; SKX-NEXT: kshiftlw $10, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax +; SKX-NEXT: andb $1, %al ; SKX-NEXT: ## kill: %AL %AL %EAX ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -439,7 +439,7 @@ ; AVX512BW-NEXT: kshiftlw $10, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: andb $1, %al ; AVX512BW-NEXT: ## kill: %AL %AL %EAX ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -450,7 +450,7 @@ ; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 ; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQ-NEXT: kmovw %k0, %eax -; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: andb $1, %al ; AVX512DQ-NEXT: ## kill: %AL %AL %EAX ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -965,8 +965,8 @@ ; SKX-LABEL: test16: ; SKX: ## BB#0: ; SKX-NEXT: kmovq %rdi, %k0 -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kshiftrw $15, %k1, %k1 +; SKX-NEXT: movb $1, %al +; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vpmovm2b %k1, %zmm0 ; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 ; SKX-NEXT: vpmovm2b %k0, %zmm1 @@ -981,8 +981,8 @@ ; AVX512BW-LABEL: test16: ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovq %rdi, %k0 -; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $15, %k1, %k1 +; AVX512BW-NEXT: movb $1, %al +; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpmovm2b %k1, %zmm0 ; AVX512BW-NEXT: vpsllq $40, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 @@ -1085,7 +1085,6 @@ ; SKX-NEXT: kmovq %rdi, %k0 ; SKX-NEXT: cmpl %edx, %esi ; SKX-NEXT: setg %al -; SKX-NEXT: andl $1, %eax ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vpmovm2b %k1, %zmm0 ; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 @@ -1103,7 +1102,6 @@ ; AVX512BW-NEXT: kmovq %rdi, %k0 ; AVX512BW-NEXT: cmpl %edx, %esi ; AVX512BW-NEXT: setg %al -; AVX512BW-NEXT: andl $1, %eax ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpmovm2b %k1, %zmm0 ; AVX512BW-NEXT: vpsllq $40, %xmm0, %xmm0 @@ -1166,21 +1164,25 @@ ; KNL-LABEL: test18: ; KNL: ## BB#0: ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: kshiftlw $7, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k2 ; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: kshiftlw $7, %k0, %k0 -; KNL-NEXT: korw %k0, %k1, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: 
kshiftlw $7, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq @@ -1191,16 +1193,20 @@ ; SKX-NEXT: kmovd %esi, %k1 ; SKX-NEXT: kshiftlw $7, %k1, %k2 ; SKX-NEXT: kshiftrw $15, %k2, %k2 +; SKX-NEXT: kmovd %k2, %eax ; SKX-NEXT: kshiftlw $6, %k1, %k1 ; SKX-NEXT: kshiftrw $15, %k1, %k1 +; SKX-NEXT: kmovd %k1, %ecx ; SKX-NEXT: vpmovm2q %k0, %zmm0 -; SKX-NEXT: vpmovm2q %k1, %zmm1 +; SKX-NEXT: kmovd %ecx, %k0 +; SKX-NEXT: vpmovm2q %k0, %zmm1 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; SKX-NEXT: vpmovq2m %zmm2, %k0 ; SKX-NEXT: kshiftlb $1, %k0, %k0 ; SKX-NEXT: kshiftrb $1, %k0, %k0 -; SKX-NEXT: kshiftlb $7, %k2, %k1 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 ; SKX-NEXT: vzeroupper @@ -1209,21 +1215,25 @@ ; AVX512BW-LABEL: test18: ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: kmovd %esi, %k2 -; AVX512BW-NEXT: kshiftlw $7, %k2, %k0 -; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512BW-NEXT: kmovd %esi, %k0 +; AVX512BW-NEXT: kshiftlw $7, %k0, %k2 ; AVX512BW-NEXT: kshiftrw $15, %k2, %k2 +; AVX512BW-NEXT: kmovd %k2, %eax +; AVX512BW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 +; AVX512BW-NEXT: kmovd %k0, %ecx ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: kshiftlw $7, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: vzeroupper @@ -1235,16 +1245,20 @@ ; AVX512DQ-NEXT: kmovw %esi, %k1 ; AVX512DQ-NEXT: kshiftlw $7, %k1, %k2 ; AVX512DQ-NEXT: kshiftrw $15, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k2, %eax ; AVX512DQ-NEXT: kshiftlw $6, %k1, %k1 ; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx ; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 -; AVX512DQ-NEXT: vpmovm2q %k1, %zmm1 +; AVX512DQ-NEXT: kmovw %ecx, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpmovq2m %zmm2, %k0 ; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0 -; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1 ; AVX512DQ-NEXT: korb %k1, %k0, %k0 ; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 @@ -1383,10 +1397,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { ; KNL-LABEL: store_v1i1: ; KNL: ## BB#0: -; KNL-NEXT: andl $1, %edi ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kxnorw %k0, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kxorw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; 
KNL-NEXT: movb %al, (%rsi) @@ -1394,20 +1406,16 @@ ; ; SKX-LABEL: store_v1i1: ; SKX: ## BB#0: -; SKX-NEXT: andl $1, %edi ; SKX-NEXT: kmovd %edi, %k0 ; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kshiftrw $15, %k1, %k1 ; SKX-NEXT: kxorw %k1, %k0, %k0 ; SKX-NEXT: kmovb %k0, (%rsi) ; SKX-NEXT: retq ; ; AVX512BW-LABEL: store_v1i1: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: andl $1, %edi ; AVX512BW-NEXT: kmovd %edi, %k0 ; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $15, %k1, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, (%rsi) @@ -1415,10 +1423,8 @@ ; ; AVX512DQ-LABEL: store_v1i1: ; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: andl $1, %edi ; AVX512DQ-NEXT: kmovw %edi, %k0 ; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1 -; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 ; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, (%rsi) ; AVX512DQ-NEXT: retq @@ -1613,59 +1619,14 @@ @f1.v = internal unnamed_addr global i1 false, align 4 define void @f1(i32 %c) { -; KNL-LABEL: f1: -; KNL: ## BB#0: ## %entry -; KNL-NEXT: movzbl {{.*}}(%rip), %edi -; KNL-NEXT: movl %edi, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k0 -; KNL-NEXT: kxnorw %k0, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, {{.*}}(%rip) -; KNL-NEXT: xorl $1, %edi -; KNL-NEXT: jmp _f2 ## TAILCALL -; -; SKX-LABEL: f1: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: movzbl {{.*}}(%rip), %edi -; SKX-NEXT: movl %edi, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kshiftrw $15, %k1, %k1 -; SKX-NEXT: kxorw %k1, %k0, %k0 -; SKX-NEXT: kmovb %k0, {{.*}}(%rip) -; SKX-NEXT: xorl $1, %edi -; SKX-NEXT: jmp _f2 ## TAILCALL -; -; AVX512BW-LABEL: f1: -; AVX512BW: ## BB#0: ## %entry -; AVX512BW-NEXT: movzbl {{.*}}(%rip), %edi -; AVX512BW-NEXT: movl %edi, %eax -; AVX512BW-NEXT: andl $1, %eax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $15, %k1, %k1 -; AVX512BW-NEXT: kxorw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, {{.*}}(%rip) -; AVX512BW-NEXT: xorl $1, %edi -; AVX512BW-NEXT: jmp _f2 ## TAILCALL -; -; AVX512DQ-LABEL: f1: -; AVX512DQ: ## BB#0: ## %entry -; AVX512DQ-NEXT: movzbl {{.*}}(%rip), %edi -; AVX512DQ-NEXT: movl %edi, %eax -; AVX512DQ-NEXT: andl $1, %eax -; AVX512DQ-NEXT: kmovw %eax, %k0 -; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1 -; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kmovb %k0, {{.*}}(%rip) -; AVX512DQ-NEXT: xorl $1, %edi -; AVX512DQ-NEXT: jmp _f2 ## TAILCALL +; CHECK-LABEL: f1: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movzbl {{.*}}(%rip), %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: xorb $1, %al +; CHECK-NEXT: movb %al, {{.*}}(%rip) +; CHECK-NEXT: xorl $1, %edi +; CHECK-NEXT: jmp _f2 ## TAILCALL entry: %.b1 = load i1, i1* @f1.v, align 4 %not..b1 = xor i1 %.b1, true Index: test/CodeGen/X86/avx512-memfold.ll =================================================================== --- test/CodeGen/X86/avx512-memfold.ll +++ test/CodeGen/X86/avx512-memfold.ll @@ -4,11 +4,9 @@ define i8 @test_int_x86_avx512_mask_cmp_ss(<4 x float> %a, float* %b, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vcmpunordss (%rdi), %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: ## kill: %AL 
%AL %EAX ; CHECK-NEXT: retq %b.val = load float, float* %b @@ -24,7 +22,6 @@ define <4 x float> @test_mask_max_ss(<4 x float> %a, float* %b, i8 %mask) { ; CHECK-LABEL: test_mask_max_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -41,7 +38,6 @@ define <4 x float> @test_maskz_add_ss(<4 x float> %a, float* %b, i8 %mask) { ; CHECK-LABEL: test_maskz_add_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -61,7 +57,6 @@ define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, double* %c, i8 %mask){ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq Index: test/CodeGen/X86/avx512-regcall-NoMask.ll =================================================================== --- test/CodeGen/X86/avx512-regcall-NoMask.ll +++ test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -1,16 +1,10 @@ -; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=X32 %s -; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=ALL --check-prefix=X32 %s +; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=ALL --check-prefix=WIN64 %s ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=LINUXOSX64 %s -; X32-LABEL: test_argReti1: -; X32: kmov{{.*}} %eax, %k{{[0-7]}} -; X32: kmov{{.*}} %k{{[0-7]}}, %eax -; X32: ret{{.*}} - -; WIN64-LABEL: test_argReti1: -; WIN64: kmov{{.*}} %eax, %k{{[0-7]}} -; WIN64: kmov{{.*}} %k{{[0-7]}}, %eax -; WIN64: ret{{.*}} +; ALL-LABEL: test_argReti1: +; ALL: incb %al +; ALL: ret{{.*}} ; Test regcall when receiving/returning i1 define x86_regcallcc i1 @test_argReti1(i1 %a) { @@ -18,17 +12,11 @@ ret i1 %add } -; X32-LABEL: test_CallargReti1: -; X32: kmov{{.*}} %k{{[0-7]}}, %eax -; X32: call{{.*}} {{.*}}test_argReti1 -; X32: kmov{{.*}} %eax, %k{{[0-7]}} -; X32: ret{{.*}} - -; WIN64-LABEL: test_CallargReti1: -; WIN64: kmov{{.*}} %k{{[0-7]}}, %eax -; WIN64: call{{.*}} {{.*}}test_argReti1 -; WIN64: kmov{{.*}} %eax, %k{{[0-7]}} -; WIN64: ret{{.*}} +; ALL-LABEL: test_CallargReti1: +; ALL: movzbl %al, %eax +; ALL: call{{.*}}test_argReti1 +; ALL: incb %al +; ALL: ret{{.*}} ; Test regcall when passing/retrieving i1 define x86_regcallcc i1 @test_CallargReti1(i1 %a) { Index: test/CodeGen/X86/avx512-select.ll =================================================================== --- test/CodeGen/X86/avx512-select.ll +++ test/CodeGen/X86/avx512-select.ll @@ -161,7 +161,7 @@ define double @pr30561_f64(double %b, double %a, i1 %c) { ; CHECK-LABEL: pr30561_f64: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: andb $1, %dil ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; CHECK-NEXT: retq @@ -172,7 +172,7 @@ define float @pr30561_f32(float %b, float %a, i1 %c) { ; CHECK-LABEL: pr30561_f32: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: 
andb $1, %dil ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; CHECK-NEXT: retq Index: test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -13,10 +13,9 @@ ; CHECK-NEXT: kshiftlb $6, %k0, %k0 ; CHECK-NEXT: kshiftrb $7, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vmovq %rax, %xmm2 -; CHECK-NEXT: kmovw %k1, %eax -; CHECK-NEXT: vmovq %rax, %xmm3 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; CHECK-NEXT: kmovw %k1, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm2 +; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2 ; CHECK-NEXT: vpsraq $63, %zmm2, %zmm2 ; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 Index: test/CodeGen/X86/avx512dq-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512dq-intrinsics.ll +++ test/CodeGen/X86/avx512dq-intrinsics.ll @@ -262,7 +262,6 @@ define <4 x float>@test_int_x86_avx512_mask_reduce_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vreducess $4, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vreducess $4, {sae}, %xmm1, %xmm0, %xmm0 @@ -279,7 +278,6 @@ define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_range_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm0 @@ -296,7 +294,6 @@ define <2 x double>@test_int_x86_avx512_mask_reduce_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_reduce_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vreducesd $4, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vreducesd $4, {sae}, %xmm1, %xmm0, %xmm0 @@ -313,7 +310,6 @@ define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_range_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vrangesd $4, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vrangesd $4, {sae}, %xmm1, %xmm0, %xmm0 @@ -367,14 +363,11 @@ define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) { ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vfpclasssd $2, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: andl $1, %ecx ; CHECK-NEXT: vfpclasssd $4, %xmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: addb %cl, %al ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq @@ -389,14 +382,11 @@ define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) { ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: andl $1, %ecx ; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: addb %cl, %al ; CHECK-NEXT: ## kill: %AL %AL 
%EAX ; CHECK-NEXT: retq Index: test/CodeGen/X86/avx512er-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512er-intrinsics.ll +++ test/CodeGen/X86/avx512er-intrinsics.ll @@ -121,9 +121,7 @@ define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) { ; CHECK-LABEL: test_rsqrt28_ss_maskz: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0] +; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ; ret <4 x float> %res @@ -132,10 +130,7 @@ define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) { ; CHECK-LABEL: test_rsqrt28_ss_mask: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] +; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ; ret <4 x float> %res @@ -144,9 +139,7 @@ define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) { ; CHECK-LABEL: test_rsqrt28_sd_maskz: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0] +; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xcd,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ; ret <2 x double> %res @@ -155,10 +148,7 @@ define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0) { ; CHECK-LABEL: test_rsqrt28_sd_mask: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1] -; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] +; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xcd,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 7, i32 8) ; ret <2 x double> %res @@ -169,9 +159,7 @@ define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr ) { ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07] 
+; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0xcd,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] %mem = load double , double * %ptr, align 8 %mem_v = insertelement <2 x double> undef, double %mem, i32 0 @@ -182,9 +170,7 @@ define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr ) { ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem_offset: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12] +; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0xcd,0x47,0x12] ; CHECK-NEXT: retq # encoding: [0xc3] %ptr1 = getelementptr double, double* %ptr, i32 18 %mem = load double , double * %ptr1, align 8 Index: test/CodeGen/X86/fast-isel-load-i1.ll =================================================================== --- test/CodeGen/X86/fast-isel-load-i1.ll +++ test/CodeGen/X86/fast-isel-load-i1.ll @@ -4,9 +4,7 @@ define i1 @test_i1(i1* %b) { ; CHECK-LABEL: test_i1: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movzbl (%rdi), %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: testb $1, (%rdi) ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # BB#1: # %in ; CHECK-NEXT: xorl %eax, %eax Index: test/CodeGen/X86/fma-fneg-combine.ll =================================================================== --- test/CodeGen/X86/fma-fneg-combine.ll +++ test/CodeGen/X86/fma-fneg-combine.ll @@ -141,7 +141,6 @@ ; SKX-LABEL: test11: ; SKX: # BB#0: # %entry ; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm0 -; SKX-NEXT: andl $1, %edi ; SKX-NEXT: kmovd %edi, %k1 ; SKX-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq @@ -150,7 +149,6 @@ ; KNL: # BB#0: # %entry ; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm0 ; KNL-NEXT: vxorps %xmm0, %xmm2, %xmm0 -; KNL-NEXT: andl $1, %edi ; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq @@ -186,7 +184,6 @@ ; SKX-LABEL: test13: ; SKX: # BB#0: # %entry ; SKX-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: andl $1, %edi ; SKX-NEXT: kmovd %edi, %k1 ; SKX-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq @@ -194,10 +191,10 @@ ; KNL-LABEL: test13: ; KNL: # BB#0: # %entry ; KNL-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: andl $1, %edi ; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq + entry: %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %sub.i, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4) Index: test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- test/CodeGen/X86/masked_gather_scatter.ll +++ test/CodeGen/X86/masked_gather_scatter.ll @@ -300,8 +300,8 @@ ; ; KNL_32-LABEL: test6: ; KNL_32: # BB#0: -; KNL_32-NEXT: kxnorw %k0, %k0, %k1 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2 +; KNL_32-NEXT: kxnorw %k0, %k0, %k1 ; KNL_32-NEXT: kxnorw %k0, %k0, %k2 ; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2} ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1} @@ -1575,7 +1575,7 @@ ; Check non-power-of-2 case. It should be scalarized. 
 declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
-; ALL-LABEL: test30:
+; ALL-LABEL: test30
 ; ALL-NOT: gather
 %sext_ind = sext <3 x i32> %ind to <3 x i64>
@@ -1691,12 +1691,12 @@
 ; KNL_32-LABEL: test_gather_16i64:
 ; KNL_32: # BB#0:
 ; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi4:
+; KNL_32-NEXT: .Lcfi0:
 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi5:
+; KNL_32-NEXT: .Lcfi1:
 ; KNL_32-NEXT: .cfi_offset %ebp, -8
 ; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi6:
+; KNL_32-NEXT: .Lcfi2:
 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT: andl $-64, %esp
 ; KNL_32-NEXT: subl $64, %esp
@@ -1814,12 +1814,12 @@
 ; KNL_32-LABEL: test_gather_16f64:
 ; KNL_32: # BB#0:
 ; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi7:
+; KNL_32-NEXT: .Lcfi3:
 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi8:
+; KNL_32-NEXT: .Lcfi4:
 ; KNL_32-NEXT: .cfi_offset %ebp, -8
 ; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi9:
+; KNL_32-NEXT: .Lcfi5:
 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT: andl $-64, %esp
 ; KNL_32-NEXT: subl $64, %esp
@@ -1936,12 +1936,12 @@
 ; KNL_32-LABEL: test_scatter_16i64:
 ; KNL_32: # BB#0:
 ; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi10:
+; KNL_32-NEXT: .Lcfi6:
 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi11:
+; KNL_32-NEXT: .Lcfi7:
 ; KNL_32-NEXT: .cfi_offset %ebp, -8
 ; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi12:
+; KNL_32-NEXT: .Lcfi8:
 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT: andl $-64, %esp
 ; KNL_32-NEXT: subl $64, %esp
@@ -2058,12 +2058,12 @@
 ; KNL_32-LABEL: test_scatter_16f64:
 ; KNL_32: # BB#0:
 ; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi13:
+; KNL_32-NEXT: .Lcfi9:
 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi14:
+; KNL_32-NEXT: .Lcfi10:
 ; KNL_32-NEXT: .cfi_offset %ebp, -8
 ; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi15:
+; KNL_32-NEXT: .Lcfi11:
 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT: andl $-64, %esp
 ; KNL_32-NEXT: subl $64, %esp
@@ -2139,12 +2139,12 @@
 ; KNL_32-LABEL: test_pr28312:
 ; KNL_32: # BB#0:
 ; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi16:
+; KNL_32-NEXT: .Lcfi12:
 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi17:
+; KNL_32-NEXT: .Lcfi13:
 ; KNL_32-NEXT: .cfi_offset %ebp, -8
 ; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi18:
+; KNL_32-NEXT: .Lcfi14:
 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT: andl $-32, %esp
 ; KNL_32-NEXT: subl $32, %esp
Index: test/CodeGen/X86/pr27591.ll
===================================================================
--- test/CodeGen/X86/pr27591.ll
+++ test/CodeGen/X86/pr27591.ll
@@ -9,12 +9,6 @@
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: testl %edi, %edi
 ; CHECK-NEXT: setne %al
-; CHECK-NEXT: # implicit-def: %EDI
-; CHECK-NEXT: movb %al, %dil
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovd %edi, %k0
-; CHECK-NEXT: kmovd %k0, %edi
-; CHECK-NEXT: movb %dil, %al
 ; CHECK-NEXT: andb $1, %al
 ; CHECK-NEXT: movzbl %al, %edi
 ; CHECK-NEXT: callq callee1
@@ -32,17 +26,9 @@
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: testl %edi, %edi
 ; CHECK-NEXT: setne %al
-; CHECK-NEXT: # implicit-def: %EDI
-; CHECK-NEXT: movb %al, %dil
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovd %edi, %k0
-; CHECK-NEXT: kmovd %k0, %edi
+; CHECK-NEXT: movzbl %al, %edi
 ; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: movb %dil, %al
-; CHECK-NEXT: xorl %edi, %edi
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: movl $-1, %ecx
-; CHECK-NEXT: cmovnel %ecx, %edi
+; CHECK-NEXT: negl %edi
 ; CHECK-NEXT: callq callee2
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
Index: test/CodeGen/X86/pr28173.ll
===================================================================
--- test/CodeGen/X86/pr28173.ll
+++ test/CodeGen/X86/pr28173.ll
@@ -8,9 +8,8 @@
 define i64 @foo64(i1 zeroext %i) #0 {
 ; CHECK-LABEL: foo64:
 ; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %EDI %EDI %RDI
-; CHECK-NEXT: orq $-2, %rdi
-; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orq $-2, %rax
 ; CHECK-NEXT: retq
 br label %bb
@@ -26,8 +25,9 @@
 define i16 @foo16(i1 zeroext %i) #0 {
 ; CHECK-LABEL: foo16:
 ; CHECK: # BB#0:
-; CHECK-NEXT: orl $65534, %edi # imm = 0xFFFE
-; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orl $65534, %eax # imm = 0xFFFE
+; CHECK-NEXT: # kill: %AX %AX %EAX
 ; CHECK-NEXT: retq
 br label %bb
@@ -43,9 +43,9 @@
 define i16 @foo16_1(i1 zeroext %i, i32 %j) #0 {
 ; CHECK-LABEL: foo16_1:
 ; CHECK: # BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: orl $2, %edi
-; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orl $2, %eax
+; CHECK-NEXT: # kill: %AX %AX %EAX
 ; CHECK-NEXT: retq
 br label %bb
@@ -61,8 +61,8 @@
 define i32 @foo32(i1 zeroext %i) #0 {
 ; CHECK-LABEL: foo32:
 ; CHECK: # BB#0:
-; CHECK-NEXT: orl $-2, %edi
-; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orl $-2, %eax
 ; CHECK-NEXT: retq
 br label %bb
Index: test/CodeGen/X86/pr32241.ll
===================================================================
--- test/CodeGen/X86/pr32241.ll
+++ test/CodeGen/X86/pr32241.ll
@@ -4,49 +4,57 @@
 define i32 @_Z3foov() {
 ; CHECK-LABEL: _Z3foov:
 ; CHECK: # BB#0: # %entry
-; CHECK-NEXT: subl $20, %esp
+; CHECK-NEXT: pushl %esi
 ; CHECK-NEXT: .Lcfi0:
-; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: subl $24, %esp
+; CHECK-NEXT: .Lcfi1:
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .Lcfi2:
+; CHECK-NEXT: .cfi_offset %esi, -8
+; CHECK-NEXT: movb $1, %al
 ; CHECK-NEXT: movw $10959, {{[0-9]+}}(%esp) # imm = 0x2ACF
 ; CHECK-NEXT: movw $-15498, {{[0-9]+}}(%esp) # imm = 0xC376
 ; CHECK-NEXT: movw $19417, {{[0-9]+}}(%esp) # imm = 0x4BD9
-; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movw {{[0-9]+}}(%esp), %cx
-; CHECK-NEXT: kxnorw %k0, %k0, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: testw %cx, %cx
-; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: cmpw $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp) # 1-byte Spill
 ; CHECK-NEXT: jne .LBB0_2
-; CHECK-NEXT: jmp .LBB0_1
-; CHECK-NEXT: .LBB0_1: # %lor.rhs
+; CHECK-NEXT: # BB#1: # %lor.rhs
 ; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: kmovd %eax, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill
 ; CHECK-NEXT: jmp .LBB0_2
 ; CHECK-NEXT: .LBB0_2: # %lor.end
-; CHECK-NEXT: kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: kshiftrw $15, %k1, %k1
-; CHECK-NEXT: movb $1, %al
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
-; CHECK-NEXT: kmovw %k1, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al # 1-byte Reload
+; CHECK-NEXT: movb $1, %cl
+; CHECK-NEXT: andb $1, %al
+; CHECK-NEXT: movzbl %al, %edx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; CHECK-NEXT: subl %edx, %esi
+; CHECK-NEXT: setl %al
+; CHECK-NEXT: andb $1, %al
+; CHECK-NEXT: movzbl %al, %edx
+; CHECK-NEXT: xorl $-1, %edx
+; CHECK-NEXT: cmpl $0, %edx
+; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill
 ; CHECK-NEXT: jne .LBB0_4
-; CHECK-NEXT: jmp .LBB0_3
-; CHECK-NEXT: .LBB0_3: # %lor.rhs4
+; CHECK-NEXT: # BB#3: # %lor.rhs4
 ; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: kmovd %eax, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill
 ; CHECK-NEXT: jmp .LBB0_4
 ; CHECK-NEXT: .LBB0_4: # %lor.end5
-; CHECK-NEXT: kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: movw %ax, %cx
-; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al # 1-byte Reload
+; CHECK-NEXT: andb $1, %al
+; CHECK-NEXT: movzbl %al, %ecx
+; CHECK-NEXT: movw %cx, %dx
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp)
 ; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: addl $20, %esp
+; CHECK-NEXT: addl $24, %esp
+; CHECK-NEXT: popl %esi
 ; CHECK-NEXT: retl
 entry:
 %aa = alloca i16, align 2
Index: test/CodeGen/X86/pr32256.ll
===================================================================
--- test/CodeGen/X86/pr32256.ll
+++ test/CodeGen/X86/pr32256.ll
@@ -7,39 +7,27 @@
 define void @_Z1av() {
 ; CHECK-LABEL: _Z1av:
 ; CHECK: # BB#0: # %entry
-; CHECK-NEXT: subl $6, %esp
+; CHECK-NEXT: subl $2, %esp
 ; CHECK-NEXT: .Lcfi0:
-; CHECK-NEXT: .cfi_def_cfa_offset 10
+; CHECK-NEXT: .cfi_def_cfa_offset 6
 ; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: kmovd %eax, %k0
-; CHECK-NEXT: movb c, %cl
-; CHECK-NEXT: # implicit-def: %EAX
-; CHECK-NEXT: movb %cl, %al
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kmovq %k1, %k2
-; CHECK-NEXT: kxnorw %k0, %k0, %k3
-; CHECK-NEXT: kshiftrw $15, %k3, %k3
-; CHECK-NEXT: kxorw %k3, %k1, %k1
-; CHECK-NEXT: kmovd %k1, %eax
 ; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: testb $1, %cl
-; CHECK-NEXT: kmovw %k2, {{[0-9]+}}(%esp) # 2-byte Spill
-; CHECK-NEXT: kmovw %k0, (%esp) # 2-byte Spill
+; CHECK-NEXT: movb c, %dl
+; CHECK-NEXT: xorb $-1, %dl
+; CHECK-NEXT: testb $1, %dl
+; CHECK-NEXT: movb %cl, (%esp) # 1-byte Spill
 ; CHECK-NEXT: jne .LBB0_1
 ; CHECK-NEXT: jmp .LBB0_2
 ; CHECK-NEXT: .LBB0_1: # %land.rhs
 ; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: kmovd %eax, %k0
-; CHECK-NEXT: kmovw %k0, (%esp) # 2-byte Spill
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: movb %cl, (%esp) # 1-byte Spill
 ; CHECK-NEXT: jmp .LBB0_2
 ; CHECK-NEXT: .LBB0_2: # %land.end
-; CHECK-NEXT: kmovw (%esp), %k0 # 2-byte Reload
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: andb $1, %cl
-; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp)
-; CHECK-NEXT: addl $6, %esp
+; CHECK-NEXT: movb (%esp), %al # 1-byte Reload
+; CHECK-NEXT: andb $1, %al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp)
+; CHECK-NEXT: addl $2, %esp
 ; CHECK-NEXT: retl
 entry:
 %b = alloca i8, align 1
Index: test/CodeGen/X86/pr32284.ll
===================================================================
--- test/CodeGen/X86/pr32284.ll
+++ test/CodeGen/X86/pr32284.ll
@@ -40,12 +40,6 @@
 ; X86-O0-NEXT: movzbl %cl, %edx
 ; X86-O0-NEXT: subl %eax, %edx
 ; X86-O0-NEXT: setle %cl
-; X86-O0-NEXT: # implicit-def: %EAX
-; X86-O0-NEXT: movb %cl, %al
-; X86-O0-NEXT: andl $1, %eax
-; X86-O0-NEXT: kmovd %eax, %k0
-; X86-O0-NEXT: kmovd %k0, %eax
-; X86-O0-NEXT: movb %al, %cl
 ; X86-O0-NEXT: andb $1, %cl
 ; X86-O0-NEXT: movzbl %cl, %eax
 ; X86-O0-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -80,12 +74,6 @@
 ; X64-O0-NEXT: movzbl %sil, %edi
 ; X64-O0-NEXT: subl %eax, %edi
 ; X64-O0-NEXT: setle %dl
-; X64-O0-NEXT: # implicit-def: %EAX
-; X64-O0-NEXT: movb %dl, %al
-; X64-O0-NEXT: andl $1, %eax
-; X64-O0-NEXT: kmovd %eax, %k0
-; X64-O0-NEXT: kmovd %k0, %eax
-; X64-O0-NEXT: movb %al, %dl
 ; X64-O0-NEXT: andb $1, %dl
 ; X64-O0-NEXT: movzbl %dl, %eax
 ; X64-O0-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
Index: test/CodeGen/X86/pr32451.ll
===================================================================
--- test/CodeGen/X86/pr32451.ll
+++ test/CodeGen/X86/pr32451.ll
@@ -25,12 +25,6 @@
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
 ; CHECK-NEXT: movl 4(%ecx), %edx
 ; CHECK-NEXT: movb (%edx), %bl
-; CHECK-NEXT: # implicit-def: %EDX
-; CHECK-NEXT: movb %bl, %dl
-; CHECK-NEXT: andl $1, %edx
-; CHECK-NEXT: kmovw %edx, %k0
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: movb %dl, %bl
 ; CHECK-NEXT: andb $1, %bl
 ; CHECK-NEXT: movzbl %bl, %edx
 ; CHECK-NEXT: movl %edx, (%esp)
Index: test/CodeGen/X86/sse-scalar-fp-arith.ll
===================================================================
--- test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1119,9 +1119,9 @@
 ;
 ; AVX512-LABEL: add_ss_mask:
 ; AVX512: # BB#0:
-; AVX512-NEXT: andl $1, %edi
+; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT: kmovw %edi, %k1
-; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1}
 ; AVX512-NEXT: vmovaps %xmm2, %xmm0
 ; AVX512-NEXT: retq
 %1 = extractelement <4 x float> %a, i64 0
@@ -1174,9 +1174,9 @@
 ;
 ; AVX512-LABEL: add_sd_mask:
 ; AVX512: # BB#0:
-; AVX512-NEXT: andl $1, %edi
+; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT: kmovw %edi, %k1
-; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1}
 ; AVX512-NEXT: vmovapd %xmm2, %xmm0
 ; AVX512-NEXT: retq
 %1 = extractelement <2 x double> %a, i64 0
Index: test/CodeGen/X86/xmulo.ll
===================================================================
--- test/CodeGen/X86/xmulo.ll
+++ test/CodeGen/X86/xmulo.ll
@@ -712,17 +712,11 @@
 ;
 ; KNL-LABEL: bug27873:
 ; KNL: ## BB#0:
-; KNL-NEXT: andl $1, %esi
 ; KNL-NEXT: movl $160, %ecx
 ; KNL-NEXT: movq %rdi, %rax
 ; KNL-NEXT: mulq %rcx
-; KNL-NEXT: kmovw %esi, %k0
 ; KNL-NEXT: seto %al
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AL %AL %EAX
+; KNL-NEXT: orb %sil, %al
 ; KNL-NEXT: retq
 %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160)
 %mul.overflow = extractvalue { i64, i1 } %mul, 1
Index: test/CodeGen/X86/xor-select-i1-combine.ll
===================================================================
--- test/CodeGen/X86/xor-select-i1-combine.ll
+++ test/CodeGen/X86/xor-select-i1-combine.ll
@@ -7,10 +7,10 @@
 define i32 @main(i8 %small) {
 ; CHECK-LABEL: main:
 ; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movl $n, %eax
-; CHECK-NEXT: movl $m, %ecx
 ; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: cmovneq %rax, %rcx
+; CHECK-NEXT: movl $m, %eax
+; CHECK-NEXT: movl $n, %ecx
+; CHECK-NEXT: cmoveq %rax, %rcx
 ; CHECK-NEXT: movl (%rcx), %eax
 ; CHECK-NEXT: retq
 entry:
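
The xor-select-i1-combine.ll hunk above is representative of the codegen shift throughout this patch: once i1 is no longer a legal type living in VK1, an i1 condition stays in general-purpose registers, so a select lowers to testb $1 plus cmov instead of round-tripping through kmov/kxor mask operations. A minimal standalone reproducer in the same spirit (a hypothetical sketch, not one of the tests touched by this patch) would be:

; Hypothetical sketch, not part of the patch: an i1 condition feeding a
; select. With this change, llc for an AVX-512 target keeps %c in a GPR
; (testb $1 + cmov), matching the new CHECK lines above, rather than
; copying it into a k-register first.
define i32 @select_i1_sketch(i1 zeroext %c, i32 %x, i32 %y) {
entry:
  %r = select i1 %c, i32 %x, i32 %y
  ret i32 %r
}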