Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -9913,11 +9913,7 @@ SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); - // We have to cast V2 around. - MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); - V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, - DAG.getBitcast(MaskVT, V1Mask), - DAG.getBitcast(MaskVT, V2))); + V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } @@ -35052,8 +35048,8 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND); - EVT VT = N->getValueType(0); - if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64) + MVT VT = N->getSimpleValueType(0); + if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); SDValue X, Y; @@ -35399,27 +35395,6 @@ return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp); } -// This promotes vectors and/or/xor to a vXi64 type. We used to do this during -// op legalization, but DAG combine yields better results. -// TODO: This is largely just to reduce the number of isel patterns. Maybe we -// can just add all the patterns or do C++ based selection in X86ISelDAGToDAG? -static SDValue promoteVecLogicOp(SDNode *N, SelectionDAG &DAG) { - MVT VT = N->getSimpleValueType(0); - - if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) - return SDValue(); - - // Already correct type. - if (VT.getVectorElementType() == MVT::i64) - return SDValue(); - - MVT NewVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); - SDValue Op0 = DAG.getBitcast(NewVT, N->getOperand(0)); - SDValue Op1 = DAG.getBitcast(NewVT, N->getOperand(1)); - return DAG.getBitcast(VT, DAG.getNode(N->getOpcode(), SDLoc(N), NewVT, - Op0, Op1)); -} - static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -35454,9 +35429,6 @@ if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (SDValue V = promoteVecLogicOp(N, DAG)) - return V; - if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; @@ -35644,7 +35616,7 @@ if (!Subtarget.hasSSE41()) return SDValue(); - MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; + MVT BlendVT = VT.is256BitVector() ? 
MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); @@ -35779,9 +35751,6 @@ if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (SDValue V = promoteVecLogicOp(N, DAG)) - return V; - if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; @@ -37757,7 +37726,9 @@ if ((VT.isVector() || VT == MVT::f128) && Subtarget.hasSSE2()) { SDLoc dl(N); - MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + unsigned IntBits = std::min(VT.getScalarSizeInBits(), 64U); + MVT IntSVT = MVT::getIntegerVT(IntBits); + MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits); SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0)); SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1)); @@ -37810,9 +37781,6 @@ if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (SDValue V = promoteVecLogicOp(N, DAG)) - return V; - if (SDValue SetCC = foldXor1SetCC(N, DAG)) return SetCC; Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -102,10 +102,6 @@ RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X); - // A vector tye of the same width with element type i64. This is used to - // create patterns for logic ops. - ValueType i64VT = !cast("v" # !srl(Size, 6) # "i64"); - // A vector type of the same width with element type i32. This is used to // create the canonical constant zero node ImmAllZerosV. ValueType i32VT = !cast("v" # !srl(Size, 5) # "i32"); @@ -5094,96 +5090,217 @@ // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// -// OpNodeMsk is the OpNode to use when element size is important. OpNode will -// be set to null_frag for 32-bit elements. -multiclass avx512_logic_rm opc, string OpcodeStr, - SDPatternOperator OpNode, - SDNode OpNodeMsk, X86FoldableSchedWrite sched, - X86VectorVTInfo _, bit IsCommutable = 0> { - let hasSideEffects = 0 in - defm rr : AVX512_maskable_logic, AVX512BIBase, EVEX_4V, - Sched<[sched]>; - - let hasSideEffects = 0, mayLoad = 1 in - defm rm : AVX512_maskable_logic, - AVX512BIBase, EVEX_4V, - Sched<[sched.Folded, sched.ReadAfterFold]>; -} +defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and, + SchedWriteVecLogic, HasAVX512, 1>; +defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or, + SchedWriteVecLogic, HasAVX512, 1>; +defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, + SchedWriteVecLogic, HasAVX512, 1>; +defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, + SchedWriteVecLogic, HasAVX512>; -// OpNodeMsk is the OpNode to use where element size is important. So use -// for all of the broadcast patterns. 
-multiclass avx512_logic_rmb opc, string OpcodeStr, - SDPatternOperator OpNode, - SDNode OpNodeMsk, X86FoldableSchedWrite sched, X86VectorVTInfo _, - bit IsCommutable = 0> : - avx512_logic_rm { - defm rmb : AVX512_maskable_logic, - AVX512BIBase, EVEX_4V, EVEX_B, - Sched<[sched.Folded, sched.ReadAfterFold]>; +let Predicates = [HasVLX] in { + def : Pat<(v16i8 (and VR128X:$src1, VR128X:$src2)), + (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>; + def : Pat<(v8i16 (and VR128X:$src1, VR128X:$src2)), + (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>; + + def : Pat<(v16i8 (or VR128X:$src1, VR128X:$src2)), + (VPORQZ128rr VR128X:$src1, VR128X:$src2)>; + def : Pat<(v8i16 (or VR128X:$src1, VR128X:$src2)), + (VPORQZ128rr VR128X:$src1, VR128X:$src2)>; + + def : Pat<(v16i8 (xor VR128X:$src1, VR128X:$src2)), + (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>; + def : Pat<(v8i16 (xor VR128X:$src1, VR128X:$src2)), + (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>; + + def : Pat<(v16i8 (X86andnp VR128X:$src1, VR128X:$src2)), + (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>; + def : Pat<(v8i16 (X86andnp VR128X:$src1, VR128X:$src2)), + (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>; + + def : Pat<(and VR128X:$src1, (loadv16i8 addr:$src2)), + (VPANDQZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(and VR128X:$src1, (loadv8i16 addr:$src2)), + (VPANDQZ128rm VR128X:$src1, addr:$src2)>; + + def : Pat<(or VR128X:$src1, (loadv16i8 addr:$src2)), + (VPORQZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(or VR128X:$src1, (loadv8i16 addr:$src2)), + (VPORQZ128rm VR128X:$src1, addr:$src2)>; + + def : Pat<(xor VR128X:$src1, (loadv16i8 addr:$src2)), + (VPXORQZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(xor VR128X:$src1, (loadv8i16 addr:$src2)), + (VPXORQZ128rm VR128X:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR128X:$src1, (loadv16i8 addr:$src2)), + (VPANDNQZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128X:$src1, (loadv8i16 addr:$src2)), + (VPANDNQZ128rm VR128X:$src1, addr:$src2)>; + + def : Pat<(and VR128X:$src1, + (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDDZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(or VR128X:$src1, + (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPORDZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(xor VR128X:$src1, + (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPXORDZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128X:$src1, + (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDNDZ128rmb VR128X:$src1, addr:$src2)>; + + def : Pat<(and VR128X:$src1, + (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDQZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(or VR128X:$src1, + (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPORQZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(xor VR128X:$src1, + (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPXORQZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128X:$src1, + (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDNQZ128rmb VR128X:$src1, addr:$src2)>; + + def : Pat<(v32i8 (and VR256X:$src1, VR256X:$src2)), + (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>; + def : Pat<(v16i16 (and VR256X:$src1, VR256X:$src2)), + (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>; + + def : Pat<(v32i8 (or VR256X:$src1, VR256X:$src2)), + (VPORQZ256rr VR256X:$src1, VR256X:$src2)>; + def : Pat<(v16i16 (or VR256X:$src1, VR256X:$src2)), + (VPORQZ256rr VR256X:$src1, VR256X:$src2)>; + + def : Pat<(v32i8 (xor VR256X:$src1, VR256X:$src2)), + (VPXORQZ256rr 
VR256X:$src1, VR256X:$src2)>; + def : Pat<(v16i16 (xor VR256X:$src1, VR256X:$src2)), + (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>; + + def : Pat<(v32i8 (X86andnp VR256X:$src1, VR256X:$src2)), + (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>; + def : Pat<(v16i16 (X86andnp VR256X:$src1, VR256X:$src2)), + (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>; + + def : Pat<(and VR256X:$src1, (loadv32i8 addr:$src2)), + (VPANDQZ256rm VR256X:$src1, addr:$src2)>; + def : Pat<(and VR256X:$src1, (loadv16i16 addr:$src2)), + (VPANDQZ256rm VR256X:$src1, addr:$src2)>; + + def : Pat<(or VR256X:$src1, (loadv32i8 addr:$src2)), + (VPORQZ256rm VR256X:$src1, addr:$src2)>; + def : Pat<(or VR256X:$src1, (loadv16i16 addr:$src2)), + (VPORQZ256rm VR256X:$src1, addr:$src2)>; + + def : Pat<(xor VR256X:$src1, (loadv32i8 addr:$src2)), + (VPXORQZ256rm VR256X:$src1, addr:$src2)>; + def : Pat<(xor VR256X:$src1, (loadv16i16 addr:$src2)), + (VPXORQZ256rm VR256X:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR256X:$src1, (loadv32i8 addr:$src2)), + (VPANDNQZ256rm VR256X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256X:$src1, (loadv16i16 addr:$src2)), + (VPANDNQZ256rm VR256X:$src1, addr:$src2)>; + + def : Pat<(and VR256X:$src1, + (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDDZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(or VR256X:$src1, + (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPORDZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(xor VR256X:$src1, + (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPXORDZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256X:$src1, + (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDNDZ256rmb VR256X:$src1, addr:$src2)>; + + def : Pat<(and VR256X:$src1, + (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDQZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(or VR256X:$src1, + (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPORQZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(xor VR256X:$src1, + (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPXORQZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256X:$src1, + (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDNQZ256rmb VR256X:$src1, addr:$src2)>; } -multiclass avx512_logic_rmb_vl opc, string OpcodeStr, - SDPatternOperator OpNode, - SDNode OpNodeMsk, X86SchedWriteWidths sched, - AVX512VLVectorVTInfo VTInfo, - bit IsCommutable = 0> { - let Predicates = [HasAVX512] in - defm Z : avx512_logic_rmb, EVEX_V512; - - let Predicates = [HasAVX512, HasVLX] in { - defm Z256 : avx512_logic_rmb, EVEX_V256; - defm Z128 : avx512_logic_rmb, EVEX_V128; - } +let Predicates = [HasAVX512] in { + def : Pat<(v64i8 (and VR512:$src1, VR512:$src2)), + (VPANDQZrr VR512:$src1, VR512:$src2)>; + def : Pat<(v32i16 (and VR512:$src1, VR512:$src2)), + (VPANDQZrr VR512:$src1, VR512:$src2)>; + + def : Pat<(v64i8 (or VR512:$src1, VR512:$src2)), + (VPORQZrr VR512:$src1, VR512:$src2)>; + def : Pat<(v32i16 (or VR512:$src1, VR512:$src2)), + (VPORQZrr VR512:$src1, VR512:$src2)>; + + def : Pat<(v64i8 (xor VR512:$src1, VR512:$src2)), + (VPXORQZrr VR512:$src1, VR512:$src2)>; + def : Pat<(v32i16 (xor VR512:$src1, VR512:$src2)), + (VPXORQZrr VR512:$src1, VR512:$src2)>; + + def : Pat<(v64i8 (X86andnp VR512:$src1, VR512:$src2)), + (VPANDNQZrr VR512:$src1, VR512:$src2)>; + def : Pat<(v32i16 (X86andnp VR512:$src1, VR512:$src2)), + (VPANDNQZrr VR512:$src1, VR512:$src2)>; + + def : Pat<(and VR512:$src1, (loadv64i8 addr:$src2)), + (VPANDQZrm VR512:$src1, 
addr:$src2)>; + def : Pat<(and VR512:$src1, (loadv32i16 addr:$src2)), + (VPANDQZrm VR512:$src1, addr:$src2)>; + + def : Pat<(or VR512:$src1, (loadv64i8 addr:$src2)), + (VPORQZrm VR512:$src1, addr:$src2)>; + def : Pat<(or VR512:$src1, (loadv32i16 addr:$src2)), + (VPORQZrm VR512:$src1, addr:$src2)>; + + def : Pat<(xor VR512:$src1, (loadv64i8 addr:$src2)), + (VPXORQZrm VR512:$src1, addr:$src2)>; + def : Pat<(xor VR512:$src1, (loadv32i16 addr:$src2)), + (VPXORQZrm VR512:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR512:$src1, (loadv64i8 addr:$src2)), + (VPANDNQZrm VR512:$src1, addr:$src2)>; + def : Pat<(X86andnp VR512:$src1, (loadv32i16 addr:$src2)), + (VPANDNQZrm VR512:$src1, addr:$src2)>; + + def : Pat<(and VR512:$src1, + (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDDZrmb VR512:$src1, addr:$src2)>; + def : Pat<(or VR512:$src1, + (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPORDZrmb VR512:$src1, addr:$src2)>; + def : Pat<(xor VR512:$src1, + (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPXORDZrmb VR512:$src1, addr:$src2)>; + def : Pat<(X86andnp VR512:$src1, + (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDNDZrmb VR512:$src1, addr:$src2)>; + + def : Pat<(and VR512:$src1, + (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDQZrmb VR512:$src1, addr:$src2)>; + def : Pat<(or VR512:$src1, + (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPORQZrmb VR512:$src1, addr:$src2)>; + def : Pat<(xor VR512:$src1, + (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPXORQZrmb VR512:$src1, addr:$src2)>; + def : Pat<(X86andnp VR512:$src1, + (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDNQZrmb VR512:$src1, addr:$src2)>; } -multiclass avx512_logic_rm_vl_dq opc_d, bits<8> opc_q, string OpcodeStr, - SDNode OpNode, X86SchedWriteWidths sched, - bit IsCommutable = 0> { - defm Q : avx512_logic_rmb_vl, - VEX_W, EVEX_CD8<64, CD8VF>; - defm D : avx512_logic_rmb_vl, - EVEX_CD8<32, CD8VF>; -} - -defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, - SchedWriteVecLogic, 1>; -defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, - SchedWriteVecLogic, 1>; -defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, - SchedWriteVecLogic, 1>; -defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, - SchedWriteVecLogic>; - //===----------------------------------------------------------------------===// // AVX-512 FP arithmetic //===----------------------------------------------------------------------===// @@ -5487,73 +5604,6 @@ defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI, SchedWriteFLogicSizes, 1>; -// Patterns catch floating point selects with bitcasted integer logic ops. -multiclass avx512_fp_logical_lowering { -let Predicates = [prd] in { - // Masked register-register logical operations. - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))), - _.RC:$src0)), - (!cast(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask, - _.RC:$src1, _.RC:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))), - _.ImmAllZerosV)), - (!cast(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1, - _.RC:$src2)>; - // Masked register-memory logical operations. 
- def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert (_.i64VT (OpNode _.RC:$src1, - (load addr:$src2)))), - _.RC:$src0)), - (!cast(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask, - _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert (_.i64VT (OpNode _.RC:$src1, (load addr:$src2)))), - _.ImmAllZerosV)), - (!cast(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1, - addr:$src2)>; - // Register-broadcast logical operations. - def : Pat<(_.i64VT (OpNode _.RC:$src1, - (bitconvert (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))), - (!cast(InstrStr#rmb) _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert - (_.i64VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), - _.RC:$src0)), - (!cast(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, - _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert - (_.i64VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), - _.ImmAllZerosV)), - (!cast(InstrStr#rmbkz) _.KRCWM:$mask, - _.RC:$src1, addr:$src2)>; -} -} - -multiclass avx512_fp_logical_lowering_sizes { - defm : avx512_fp_logical_lowering; - defm : avx512_fp_logical_lowering; - defm : avx512_fp_logical_lowering; - defm : avx512_fp_logical_lowering; - defm : avx512_fp_logical_lowering; - defm : avx512_fp_logical_lowering; -} - -defm : avx512_fp_logical_lowering_sizes<"VPAND", and>; -defm : avx512_fp_logical_lowering_sizes<"VPOR", or>; -defm : avx512_fp_logical_lowering_sizes<"VPXOR", xor>; -defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>; - let Predicates = [HasVLX,HasDQI] in { // Use packed logical operations for scalar ops. def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)), @@ -5683,15 +5733,12 @@ defm rr : AVX512_maskable_cmp, + (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)>, EVEX_4V, Sched<[sched]>; defm rm : AVX512_maskable_cmp, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -5725,7 +5772,7 @@ // Use 512bit version to implement 128/256 bit in case NoVLX. multiclass avx512_vptest_lowering { - def : Pat<(_.KVT (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))), + def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)), (_.KVT (COPY_TO_REGCLASS (!cast(Name # "Zrr") @@ -5736,7 +5783,7 @@ _.KRC))>; def : Pat<(_.KVT (and _.KRC:$mask, - (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))), + (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV))), (COPY_TO_REGCLASS (!cast(Name # "Zrrk") @@ -11355,19 +11402,68 @@ // TODO: We should maybe have a more generalized algorithm for folding to // vpternlog. 
let Predicates = [HasAVX512] in { - def : Pat<(v8i64 (xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV)))), + def : Pat<(xor VR512:$src, (bc_v64i8 (v16i32 immAllOnesV))), + (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; + def : Pat<(xor VR512:$src, (bc_v32i16 (v16i32 immAllOnesV))), + (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; + def : Pat<(xor VR512:$src, (bc_v16i32 (v16i32 immAllOnesV))), + (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; + def : Pat<(xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV))), (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; } let Predicates = [HasAVX512, NoVLX] in { - def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))), + def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (i8 15)), sub_xmm)>; + def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (i8 15)), sub_xmm)>; + def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (i8 15)), sub_xmm)>; + def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (i8 15)), sub_xmm)>; - def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))), + + def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (i8 15)), sub_ymm)>; + def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (i8 15)), sub_ymm)>; + def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (i8 15)), sub_ymm)>; + def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), @@ -11377,9 +11473,22 @@ } let Predicates = [HasVLX] in { - def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))), + def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))), + (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; + def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))), + (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; + def :
Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))), (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; - def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))), + def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))), + (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; + + def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))), + (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; + def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))), + (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; + def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))), + (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; + def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))), (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; } Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -855,6 +855,7 @@ // 512-bit bitconvert pattern fragments def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>; +def bc_v32i16 : PatFrag<(ops node:$in), (v32i16 (bitconvert node:$in))>; def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>; def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>; def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>; Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -2389,24 +2389,136 @@ let isCommutable = 0 in defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>; +let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)), + (VPANDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)), + (VPANDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)), + (VPANDYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)), + (VPORYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)), + (VPORYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)), + (VPORYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)), + (VPXORYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)), + (VPXORYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)), + (VPXORYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)), + (VPANDNYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)), + (VPANDNYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)), + (VPANDNYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)), + (VPANDYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)), + (VPANDYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)), + (VPANDYrm VR256:$src1, addr:$src2)>; + + def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)), + (VPORYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)), + (VPORYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)), + (VPORYrm VR256:$src1, addr:$src2)>; + + def : Pat<(xor VR256:$src1, 
(loadv32i8 addr:$src2)), + (VPXORYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)), + (VPXORYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)), + (VPXORYrm VR256:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)), + (VPANDNYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)), + (VPANDNYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)), + (VPANDNYrm VR256:$src1, addr:$src2)>; +} + // If only AVX1 is supported, we need to handle integer operations with // floating point instructions since the integer versions aren't available. let Predicates = [HasAVX1Only] in { + def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)), + (VANDPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)), + (VANDPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)), + (VANDPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)), (VANDPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)), + (VORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)), + (VORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)), + (VORPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)), (VORPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)), + (VXORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)), + (VXORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)), + (VXORPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)), (VXORPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)), + (VANDNPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)), + (VANDNPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)), + (VANDNPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)), (VANDNPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)), + (VANDPSYrm VR256:$src1, addr:$src2)>; def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)), (VANDPSYrm VR256:$src1, addr:$src2)>; + + def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)), + (VORPSYrm VR256:$src1, addr:$src2)>; def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)), (VORPSYrm VR256:$src1, addr:$src2)>; + + def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)), + (VXORPSYrm VR256:$src1, addr:$src2)>; def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)), (VXORPSYrm VR256:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)), + (VANDNPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)), + (VANDNPSYrm 
VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)), + (VANDNPSYrm VR256:$src1, addr:$src2)>; def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)), (VANDNPSYrm VR256:$src1, addr:$src2)>; } @@ -2504,6 +2616,122 @@ FR64)>; } +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)), + (VPANDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)), + (VPANDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)), + (VPANDrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)), + (VPORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)), + (VPORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)), + (VPORrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)), + (VPXORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)), + (VPXORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)), + (VPXORrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)), + (VPANDNrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)), + (VPANDNrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)), + (VPANDNrr VR128:$src1, VR128:$src2)>; + + def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)), + (VPANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)), + (VPANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)), + (VPANDrm VR128:$src1, addr:$src2)>; + + def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)), + (VPORrm VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)), + (VPORrm VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)), + (VPORrm VR128:$src1, addr:$src2)>; + + def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)), + (VPXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)), + (VPXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)), + (VPXORrm VR128:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)), + (VPANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)), + (VPANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)), + (VPANDNrm VR128:$src1, addr:$src2)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)), + (PANDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)), + (PANDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)), + (PANDrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)), + (PORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)), + (PORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)), + (PORrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)), + (PXORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)), + (PXORrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)), + (PXORrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)), + (PANDNrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)), + 
(PANDNrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)), + (PANDNrr VR128:$src1, VR128:$src2)>; + + def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)), + (PANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)), + (PANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)), + (PANDrm VR128:$src1, addr:$src2)>; + + def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)), + (PORrm VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)), + (PORrm VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)), + (PORrm VR128:$src1, addr:$src2)>; + + def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)), + (PXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)), + (PXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)), + (PXORrm VR128:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)), + (PANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)), + (PANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)), + (PANDNrm VR128:$src1, addr:$src2)>; +} + // Patterns for packed operations when we don't have integer type available. def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)), (ANDPSrr VR128:$src1, VR128:$src2)>; Index: lib/Target/X86/X86InstrXOP.td =================================================================== --- lib/Target/X86/X86InstrXOP.td +++ lib/Target/X86/X86InstrXOP.td @@ -350,6 +350,7 @@ [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1), (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V, Sched<[sched]>; + // FIXME: This pattern can't match. 
def rrm : IXOPi8Reg, VEX_L; } +let Predicates = [HasXOP] in { + def : Pat<(v16i8 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v8i16 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v4i32 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>; + + def : Pat<(or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, (bc_v16i8 (loadv2i64 addr:$src2)))), + (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>; + def : Pat<(or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, (bc_v8i16 (loadv2i64 addr:$src2)))), + (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>; + def : Pat<(or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>; + + def : Pat<(v32i8 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>; + def : Pat<(v16i16 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>; + def : Pat<(v8i32 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>; + + def : Pat<(or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, (bc_v32i8 (loadv4i64 addr:$src2)))), + (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>; + def : Pat<(or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, (bc_v16i16 (loadv4i64 addr:$src2)))), + (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>; + def : Pat<(or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>; +} + multiclass xop_vpermil2 Opc, string OpcodeStr, RegisterClass RC, X86MemOperand intmemop, X86MemOperand fpmemop, ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag, Index: test/CodeGen/X86/avx-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -85,7 +85,10 @@ define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind { ; CHECK-LABEL: test_mm256_andnot_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 +; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = bitcast <8 x float> %a0 to <8 x i32> %2 = bitcast <8 x float> %a1 to <8 x i32> Index: test/CodeGen/X86/avx-logic.ll =================================================================== --- test/CodeGen/X86/avx-logic.ll +++ test/CodeGen/X86/avx-logic.ll @@ -351,12 +351,22 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; INT256-LABEL: andn_disguised_i8_elts: -; INT256: # %bb.0: -; INT256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; INT256-NEXT: vpandn {{.*}}(%rip), %ymm0, %ymm0 -; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; INT256-NEXT: retq +; AVX2-LABEL: andn_disguised_i8_elts: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: 
vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: andn_disguised_i8_elts: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq %add = add <8 x i32> %y, %x %neg = and <8 x i32> %add, %and = xor <8 x i32> %neg, @@ -401,11 +411,21 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; INT256-LABEL: andn_constant_mask_operand_no_concat: -; INT256: # %bb.0: -; INT256-NEXT: vpandn {{.*}}(%rip), %ymm0, %ymm0 -; INT256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; INT256-NEXT: retq +; AVX2-LABEL: andn_constant_mask_operand_no_concat: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: andn_constant_mask_operand_no_concat: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq %xor = xor <8 x i32> %x, %and = and <8 x i32> %xor, %r = add <8 x i32> %and, %y Index: test/CodeGen/X86/avx512-arith.ll =================================================================== --- test/CodeGen/X86/avx512-arith.ll +++ test/CodeGen/X86/avx512-arith.ll @@ -601,17 +601,17 @@ define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { ; AVX512F-LABEL: andd512fold: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpandq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vpandd (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: andd512fold: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpandq (%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vpandd (%rdi), %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: andd512fold: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpandq (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: andd512fold: Index: test/CodeGen/X86/avx512-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -361,7 +361,8 @@ define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) { ; CHECK-LABEL: test_mm512_testn_epi32_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vzeroupper @@ -378,7 +379,8 @@ ; X86-LABEL: test_mm512_mask_testn_epi32_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1} +; X86-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: vzeroupper @@ -386,8 +388,9 @@ ; ; X64-LABEL: test_mm512_mask_testn_epi32_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: vptestnmd %zmm0, %zmm0, %k0 {%k1} ; X64-NEXT: kmovw %k0, %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: vzeroupper @@ -449,7 +452,8 @@ ; X86-LABEL: test_mm512_mask_test_epi32_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 -; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1} +; X86-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; X86-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: vzeroupper @@ -457,8 +461,9 @@ ; ; X64-LABEL: test_mm512_mask_test_epi32_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; X64-NEXT: kmovw %k0, %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: vzeroupper @@ -3614,8 +3619,8 @@ ; CHECK-LABEL: test_mm512_fnmsub_round_ps: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0] -; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4 -; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: vpxord %zmm3, %zmm0, %zmm4 +; CHECK-NEXT: vpxord %zmm3, %zmm2, %zmm0 ; CHECK-NEXT: vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} entry: @@ -3837,8 +3842,8 @@ ; CHECK-LABEL: test_mm512_fnmsub_ps: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0] -; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4 -; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: vpxord %zmm3, %zmm0, %zmm4 +; CHECK-NEXT: vpxord %zmm3, %zmm2, %zmm0 ; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0 ; CHECK-NEXT: ret{{[l|q]}} entry: Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1658,7 +1658,7 @@ define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: test_xor_epi32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xef,0xc1] +; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xef,0xc1] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) ret < 16 x i32> %res @@ -1687,7 +1687,7 @@ define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: test_or_epi32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xeb,0xc1] +; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xeb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) ret < 16 x i32> %res @@ -1716,7 +1716,7 @@ define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: test_and_epi32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xdb,0xc1] +; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xdb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) ret < 16 x i32> %res Index: test/CodeGen/X86/avx512-logic.ll =================================================================== --- test/CodeGen/X86/avx512-logic.ll +++ test/CodeGen/X86/avx512-logic.ll @@ -7,7 +7,7 @@ ; ALL-LABEL: vpandd: ; ALL: ## %bb.0: ## %entry ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; ALL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; 
ALL-NEXT: retq entry: ; Force the execution domain with an add. @@ -21,7 +21,7 @@ ; ALL-LABEL: vpandnd: ; ALL: ## %bb.0: ## %entry ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; ALL-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpandnd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq entry: ; Force the execution domain with an add. @@ -37,7 +37,7 @@ ; ALL-LABEL: vpord: ; ALL: ## %bb.0: ## %entry ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; ALL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq entry: ; Force the execution domain with an add. @@ -51,7 +51,7 @@ ; ALL-LABEL: vpxord: ; ALL: ## %bb.0: ## %entry ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; ALL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq entry: ; Force the execution domain with an add. @@ -132,7 +132,7 @@ define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { ; KNL-LABEL: andd512fold: ; KNL: ## %bb.0: ## %entry -; KNL-NEXT: vpandq (%rdi), %zmm0, %zmm0 +; KNL-NEXT: vpandd (%rdi), %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: andd512fold: @@ -442,14 +442,16 @@ define <8 x i64> @test_mm512_mask_and_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; KNL-LABEL: test_mm512_mask_and_epi32: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandd %zmm2, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_mask_and_epi32: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %and1.i.i = and <8 x i64> %__a, %__b @@ -464,14 +466,16 @@ define <8 x i64> @test_mm512_mask_or_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; KNL-LABEL: test_mm512_mask_or_epi32: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vporq %zmm2, %zmm1, %zmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpord %zmm2, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_mask_or_epi32: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vporq %zmm2, %zmm1, %zmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %or1.i.i = or <8 x i64> %__a, %__b @@ -486,14 +490,16 @@ define <8 x i64> @test_mm512_mask_xor_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; KNL-LABEL: test_mm512_mask_xor_epi32: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpxorq %zmm2, %zmm1, %zmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxord %zmm2, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_mask_xor_epi32: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vpxorq %zmm2, %zmm1, %zmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %xor1.i.i = xor <8 x i64> %__a, %__b @@ -508,14 +514,16 @@ define <8 x double> @test_mm512_mask_xor_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; KNL-LABEL: test_mm512_mask_xor_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpxorq %zmm2, %zmm1, %zmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxorq %zmm2, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vmovapd %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_mask_xor_pd: ; SKX: ## %bb.0: ## %entry +; 
SKX-NEXT: vxorpd %zmm2, %zmm1, %zmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vmovapd %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -530,14 +538,16 @@ define <8 x double> @test_mm512_maskz_xor_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; KNL-LABEL: test_mm512_maskz_xor_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_maskz_xor_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -552,14 +562,16 @@ define <16 x float> @test_mm512_mask_xor_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; KNL-LABEL: test_mm512_mask_xor_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpxord %zmm2, %zmm1, %zmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxord %zmm2, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_mask_xor_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -574,14 +586,16 @@ define <16 x float> @test_mm512_maskz_xor_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; KNL-LABEL: test_mm512_maskz_xor_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxord %zmm1, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_maskz_xor_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -596,14 +610,16 @@ define <8 x double> @test_mm512_mask_or_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; KNL-LABEL: test_mm512_mask_or_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vporq %zmm1, %zmm2, %zmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: vmovapd %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_mask_or_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorpd %zmm1, %zmm2, %zmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vmovapd %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -618,14 +634,16 @@ define <8 x double> @test_mm512_maskz_or_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; KNL-LABEL: test_mm512_maskz_or_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vporq %zmm0, %zmm1, %zmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vporq %zmm0, %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_maskz_or_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorpd %zmm0, %zmm1, %zmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} +; SKX-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -640,14 +658,16 
@@ define <16 x float> @test_mm512_mask_or_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; KNL-LABEL: test_mm512_mask_or_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpord %zmm1, %zmm2, %zmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_mask_or_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorps %zmm1, %zmm2, %zmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -662,14 +682,16 @@ define <16 x float> @test_mm512_maskz_or_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; KNL-LABEL: test_mm512_maskz_or_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpord %zmm0, %zmm1, %zmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpord %zmm0, %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_maskz_or_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorps %zmm0, %zmm1, %zmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} +; SKX-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -684,14 +706,16 @@ define <8 x double> @test_mm512_mask_and_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; KNL-LABEL: test_mm512_mask_and_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpandq %zmm1, %zmm2, %zmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandq %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: vmovapd %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_mask_and_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandpd %zmm1, %zmm2, %zmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vmovapd %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -706,14 +730,16 @@ define <8 x double> @test_mm512_maskz_and_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; KNL-LABEL: test_mm512_maskz_and_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandq %zmm0, %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_maskz_and_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandpd %zmm0, %zmm1, %zmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} +; SKX-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -728,14 +754,16 @@ define <16 x float> @test_mm512_mask_and_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; KNL-LABEL: test_mm512_mask_and_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpandd %zmm1, %zmm2, %zmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandd %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_mask_and_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandps %zmm1, %zmm2, %zmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -750,14 +778,16 @@ define <16 x float> @test_mm512_maskz_and_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; KNL-LABEL: test_mm512_maskz_and_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpandd %zmm0, %zmm1, %zmm0 ; KNL-NEXT: kmovw %edi, 
%k1 -; KNL-NEXT: vpandd %zmm0, %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_maskz_and_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandps %zmm0, %zmm1, %zmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} +; SKX-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -772,14 +802,16 @@ define <8 x double> @test_mm512_mask_andnot_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; KNL-LABEL: test_mm512_mask_andnot_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpandnq %zmm2, %zmm1, %zmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnq %zmm2, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vmovapd %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_mask_andnot_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnpd %zmm2, %zmm1, %zmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vmovapd %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -795,14 +827,16 @@ define <8 x double> @test_mm512_maskz_andnot_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; KNL-LABEL: test_mm512_maskz_andnot_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnq %zmm1, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_maskz_andnot_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnpd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -818,14 +852,16 @@ define <16 x float> @test_mm512_mask_andnot_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; KNL-LABEL: test_mm512_mask_andnot_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpandnd %zmm2, %zmm1, %zmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnd %zmm2, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_mask_andnot_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnps %zmm2, %zmm1, %zmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -841,14 +877,16 @@ define <16 x float> @test_mm512_maskz_andnot_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; KNL-LABEL: test_mm512_maskz_andnot_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnd %zmm1, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm512_maskz_andnot_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnps %zmm1, %zmm0, %zmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -3177,7 +3177,7 @@ ; KNL: ## %bb.0: ; KNL-NEXT: pushq %rax ; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, 
%eax ; KNL-NEXT: testw %ax, %ax @@ -3196,7 +3196,7 @@ ; SKX: ## %bb.0: ; SKX-NEXT: pushq %rax ; SKX-NEXT: .cfi_def_cfa_offset 16 -; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vpord %zmm1, %zmm0, %zmm0 ; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: testw %ax, %ax @@ -3215,7 +3215,7 @@ ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: pushq %rax ; AVX512BW-NEXT: .cfi_def_cfa_offset 16 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testw %ax, %ax @@ -3234,7 +3234,7 @@ ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: pushq %rax ; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512DQ-NEXT: kmovw %k0, %eax ; AVX512DQ-NEXT: testw %ax, %ax @@ -3253,7 +3253,7 @@ ; X86: ## %bb.0: ; X86-NEXT: subl $12, %esp ; X86-NEXT: .cfi_def_cfa_offset 16 -; X86-NEXT: vporq %zmm1, %zmm0, %zmm0 +; X86-NEXT: vpord %zmm1, %zmm0, %zmm0 ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: testw %ax, %ax @@ -3287,7 +3287,7 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kortestw %k0, %k0 ; CHECK-NEXT: jb LBB65_2 @@ -3303,7 +3303,7 @@ ; X86: ## %bb.0: ; X86-NEXT: subl $12, %esp ; X86-NEXT: .cfi_def_cfa_offset 16 -; X86-NEXT: vporq %zmm1, %zmm0, %zmm0 +; X86-NEXT: vpord %zmm1, %zmm0, %zmm0 ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kortestw %k0, %k0 ; X86-NEXT: jb LBB65_2 @@ -3448,8 +3448,9 @@ define void @mask_not_cast(i8*, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>) { ; CHECK-LABEL: mask_not_cast: ; CHECK: ## %bb.0: +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm1 ; CHECK-NEXT: vpcmpnleud %zmm3, %zmm2, %k1 -; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k1 {%k1} +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} ; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3457,8 +3458,9 @@ ; X86-LABEL: mask_not_cast: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpandq %zmm0, %zmm1, %zmm1 ; X86-NEXT: vpcmpnleud %zmm3, %zmm2, %k1 -; X86-NEXT: vptestmd %zmm0, %zmm1, %k1 {%k1} +; X86-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} ; X86-NEXT: vmovdqu32 %zmm0, (%eax) {%k1} ; X86-NEXT: vzeroupper ; X86-NEXT: retl Index: test/CodeGen/X86/avx512-schedule.ll =================================================================== --- test/CodeGen/X86/avx512-schedule.ll +++ test/CodeGen/X86/avx512-schedule.ll @@ -5029,13 +5029,13 @@ ; GENERIC-LABEL: vpandd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpandd %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpandd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.50] +; SKX-NEXT: vpandd %zmm1, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. 
@@ -5049,13 +5049,13 @@ ; GENERIC-LABEL: vpandnd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpandnd %zmm0, %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpandnd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.50] +; SKX-NEXT: vpandnd %zmm0, %zmm1, %zmm0 # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. @@ -5071,13 +5071,13 @@ ; GENERIC-LABEL: vpord: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpord %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpord: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.50] +; SKX-NEXT: vpord %zmm1, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. @@ -5091,13 +5091,13 @@ ; GENERIC-LABEL: vpxord: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpxord %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpxord: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.50] +; SKX-NEXT: vpxord %zmm1, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. 
@@ -5455,14 +5455,16 @@ define <8 x i64> @test_mm512_mask_and_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; GENERIC-LABEL: test_mm512_mask_and_epi32: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vpandq %zmm2, %zmm1, %zmm1 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:1.00] +; GENERIC-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_and_epi32: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vpandq %zmm2, %zmm1, %zmm1 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.50] +; SKX-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %and1.i.i = and <8 x i64> %__a, %__b @@ -5477,14 +5479,16 @@ define <8 x i64> @test_mm512_mask_or_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; GENERIC-LABEL: test_mm512_mask_or_epi32: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vporq %zmm2, %zmm1, %zmm1 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:1.00] +; GENERIC-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_or_epi32: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vporq %zmm2, %zmm1, %zmm1 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.50] +; SKX-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %or1.i.i = or <8 x i64> %__a, %__b @@ -5499,14 +5503,16 @@ define <8 x i64> @test_mm512_mask_xor_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; GENERIC-LABEL: test_mm512_mask_xor_epi32: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vpxorq %zmm2, %zmm1, %zmm1 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:1.00] +; GENERIC-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_xor_epi32: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vpxorq %zmm2, %zmm1, %zmm1 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.50] +; SKX-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %xor1.i.i = xor <8 x i64> %__a, %__b @@ -5521,14 +5527,16 @@ define <8 x double> @test_mm512_mask_xor_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_xor_pd: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vxorpd %zmm2, %zmm1, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:1.00] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_xor_pd: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vxorpd %zmm2, %zmm1, %zmm1 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.50] +; SKX-NEXT: vmovapd %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -5543,14 +5551,16 @@ define <8 x double> @test_mm512_maskz_xor_pd(i8 zeroext %__U, <8 
x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_maskz_xor_pd: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vxorpd %zmm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] +; GENERIC-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_xor_pd: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.50] +; SKX-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -5565,14 +5575,16 @@ define <16 x float> @test_mm512_mask_xor_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_xor_ps: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vxorps %zmm2, %zmm1, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:1.00] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_xor_ps: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm1 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.50] +; SKX-NEXT: vmovaps %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -5587,14 +5599,16 @@ define <16 x float> @test_mm512_maskz_xor_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_xor_ps: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] +; GENERIC-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_xor_ps: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.50] +; SKX-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -5609,14 +5623,16 @@ define <8 x double> @test_mm512_mask_or_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_or_pd: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vorpd %zmm1, %zmm2, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:1.00] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_or_pd: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vorpd %zmm1, %zmm2, %zmm1 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.50] +; SKX-NEXT: vmovapd %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -5631,14 +5647,16 @@ define <8 x double> @test_mm512_maskz_or_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: 
test_mm512_maskz_or_pd: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vorpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:1.00] +; GENERIC-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_or_pd: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vorpd %zmm0, %zmm1, %zmm0 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50] +; SKX-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -5653,14 +5671,16 @@ define <16 x float> @test_mm512_mask_or_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_or_ps: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vorps %zmm1, %zmm2, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:1.00] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_or_ps: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vorps %zmm1, %zmm2, %zmm1 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.50] +; SKX-NEXT: vmovaps %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -5675,14 +5695,16 @@ define <16 x float> @test_mm512_maskz_or_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_or_ps: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vorps %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:1.00] +; GENERIC-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_or_ps: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vorps %zmm0, %zmm1, %zmm0 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50] +; SKX-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -5697,14 +5719,16 @@ define <8 x double> @test_mm512_mask_and_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_and_pd: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vandpd %zmm1, %zmm2, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:1.00] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_and_pd: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vandpd %zmm1, %zmm2, %zmm1 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.50] +; SKX-NEXT: vmovapd %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -5719,14 +5743,16 @@ define <8 x double> @test_mm512_maskz_and_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_maskz_and_pd: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vandpd %zmm0, 
%zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:1.00] +; GENERIC-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_and_pd: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vandpd %zmm0, %zmm1, %zmm0 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50] +; SKX-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -5741,14 +5767,16 @@ define <16 x float> @test_mm512_mask_and_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_and_ps: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vandps %zmm1, %zmm2, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:1.00] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_and_ps: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vandps %zmm1, %zmm2, %zmm1 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.50] +; SKX-NEXT: vmovaps %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -5763,14 +5791,16 @@ define <16 x float> @test_mm512_maskz_and_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_and_ps: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vandps %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:1.00] +; GENERIC-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_and_ps: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vandps %zmm0, %zmm1, %zmm0 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50] +; SKX-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -5785,14 +5815,16 @@ define <8 x double> @test_mm512_mask_andnot_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_andnot_pd: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vandnpd %zmm2, %zmm1, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:1.00] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_andnot_pd: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vandnpd %zmm2, %zmm1, %zmm1 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.50] +; SKX-NEXT: vmovapd %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -5808,14 +5840,16 @@ define <8 x double> @test_mm512_maskz_andnot_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_maskz_andnot_pd: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vandnpd %zmm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: 
kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] +; GENERIC-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_andnot_pd: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vandnpd %zmm1, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.50] +; SKX-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -5831,14 +5865,16 @@ define <16 x float> @test_mm512_mask_andnot_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_andnot_ps: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vandnps %zmm2, %zmm1, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:1.00] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_andnot_ps: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vandnps %zmm2, %zmm1, %zmm1 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.50] +; SKX-NEXT: vmovaps %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -5854,14 +5890,16 @@ define <16 x float> @test_mm512_maskz_andnot_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_andnot_ps: ; GENERIC: # %bb.0: # %entry +; GENERIC-NEXT: vandnps %zmm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] +; GENERIC-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_andnot_ps: ; SKX: # %bb.0: # %entry +; SKX-NEXT: vandnps %zmm1, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.50] +; SKX-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> Index: test/CodeGen/X86/avx512-select.ll =================================================================== --- test/CodeGen/X86/avx512-select.ll +++ test/CodeGen/X86/avx512-select.ll @@ -11,7 +11,7 @@ ; X86-NEXT: # %bb.1: ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 ; X86-NEXT: .LBB0_2: -; X86-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; X86-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: select00: @@ -22,7 +22,7 @@ ; X64-NEXT: # %bb.1: ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 ; X64-NEXT: .LBB0_2: -; X64-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; X64-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq %cmpres = icmp eq i32 %a, 255 %selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b Index: test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -562,7 +562,8 @@ define i64 @test_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) { ; X86-LABEL: test_mm512_test_epi8_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vptestmb %zmm0, %zmm1, %k0 +; X86-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; X86-NEXT: 
vptestmb %zmm0, %zmm0, %k0 ; X86-NEXT: kshiftrq $32, %k0, %k1 ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: kmovd %k1, %edx @@ -571,7 +572,8 @@ ; ; X64-LABEL: test_mm512_test_epi8_mask: ; X64: # %bb.0: # %entry -; X64-NEXT: vptestmb %zmm0, %zmm1, %k0 +; X64-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; X64-NEXT: vptestmb %zmm0, %zmm0, %k0 ; X64-NEXT: kmovq %k0, %rax ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -586,7 +588,8 @@ define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) { ; X86-LABEL: test_mm512_mask_test_epi8_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vptestmb %zmm0, %zmm1, %k0 +; X86-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; X86-NEXT: vptestmb %zmm0, %zmm0, %k0 ; X86-NEXT: kshiftrq $32, %k0, %k1 ; X86-NEXT: kmovd %k1, %edx ; X86-NEXT: kmovd %k0, %eax @@ -597,8 +600,9 @@ ; ; X64-LABEL: test_mm512_mask_test_epi8_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; X64-NEXT: kmovq %rdi, %k1 -; X64-NEXT: vptestmb %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: vptestmb %zmm0, %zmm0, %k0 {%k1} ; X64-NEXT: kmovq %k0, %rax ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -615,7 +619,8 @@ define i32 @test_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) { ; CHECK-LABEL: test_mm512_test_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestmw %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} @@ -631,15 +636,17 @@ ; X86-LABEL: test_mm512_mask_test_epi16_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vptestmw %zmm0, %zmm1, %k0 {%k1} +; X86-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; X86-NEXT: vptestmw %zmm0, %zmm0, %k0 {%k1} ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: test_mm512_mask_test_epi16_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vptestmw %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: vptestmw %zmm0, %zmm0, %k0 {%k1} ; X64-NEXT: kmovd %k0, %eax ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -656,7 +663,8 @@ define i64 @test_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) { ; X86-LABEL: test_mm512_testn_epi8_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vptestnmb %zmm0, %zmm1, %k0 +; X86-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; X86-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; X86-NEXT: kshiftrq $32, %k0, %k1 ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: kmovd %k1, %edx @@ -665,7 +673,8 @@ ; ; X64-LABEL: test_mm512_testn_epi8_mask: ; X64: # %bb.0: # %entry -; X64-NEXT: vptestnmb %zmm0, %zmm1, %k0 +; X64-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; X64-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; X64-NEXT: kmovq %k0, %rax ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -680,7 +689,8 @@ define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) { ; X86-LABEL: test_mm512_mask_testn_epi8_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vptestnmb %zmm0, %zmm1, %k0 +; X86-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; X86-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; X86-NEXT: kshiftrq $32, %k0, %k1 ; X86-NEXT: kmovd %k1, %edx ; X86-NEXT: kmovd %k0, %eax @@ -691,8 +701,9 @@ ; ; X64-LABEL: test_mm512_mask_testn_epi8_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; X64-NEXT: kmovq %rdi, %k1 -; X64-NEXT: vptestnmb %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: vptestnmb %zmm0, %zmm0, %k0 {%k1} ; X64-NEXT: kmovq %k0, %rax ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -709,7 +720,8 @@ define i32 @test_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x 
i64> %__B) { ; CHECK-LABEL: test_mm512_testn_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} @@ -725,15 +737,17 @@ ; X86-LABEL: test_mm512_mask_testn_epi16_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vptestnmw %zmm0, %zmm1, %k0 {%k1} +; X86-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; X86-NEXT: vptestnmw %zmm0, %zmm0, %k0 {%k1} ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: test_mm512_mask_testn_epi16_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vptestnmw %zmm0, %zmm1, %k0 {%k1} +; X64-NEXT: vptestnmw %zmm0, %zmm0, %k0 {%k1} ; X64-NEXT: kmovd %k0, %eax ; X64-NEXT: vzeroupper ; X64-NEXT: retq Index: test/CodeGen/X86/avx512bw-vec-test-testn.ll =================================================================== --- test/CodeGen/X86/avx512bw-vec-test-testn.ll +++ test/CodeGen/X86/avx512bw-vec-test-testn.ll @@ -5,7 +5,8 @@ define zeroext i32 @TEST_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_test_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestmw %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -22,7 +23,8 @@ define zeroext i64 @TEST_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_test_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestmb %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovq %k0, %rax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -38,7 +40,8 @@ define zeroext i32 @TEST_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_test_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestmw %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: vzeroupper @@ -57,7 +60,8 @@ define zeroext i64 @TEST_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_test_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestmb %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovq %k0, %rax ; CHECK-NEXT: andq %rdi, %rax ; CHECK-NEXT: vzeroupper @@ -76,7 +80,8 @@ define zeroext i32 @TEST_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_testn_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -93,7 +98,8 @@ define zeroext i64 @TEST_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_testn_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovq %k0, %rax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: 
retq @@ -109,7 +115,8 @@ define zeroext i32 @TEST_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_testn_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: vzeroupper @@ -128,7 +135,8 @@ define zeroext i64 @TEST_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_testn_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovq %k0, %rax ; CHECK-NEXT: andq %rdi, %rax ; CHECK-NEXT: vzeroupper Index: test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll @@ -7,7 +7,8 @@ define zeroext i16 @test_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) { ; CHECK-LABEL: test_mm_test_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestmb %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: ret{{[l|q]}} @@ -23,15 +24,17 @@ ; X86-LABEL: test_mm_mask_test_epi8_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vptestmb %xmm0, %xmm1, %k0 {%k1} +; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86-NEXT: vptestmb %xmm0, %xmm0, %k0 {%k1} ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_test_epi8_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vptestmb %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: vptestmb %xmm0, %xmm0, %k0 {%k1} ; X64-NEXT: kmovd %k0, %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: retq @@ -48,7 +51,8 @@ define i32 @test_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) { ; CHECK-LABEL: test_mm256_test_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestmb %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} @@ -64,15 +68,17 @@ ; X86-LABEL: test_mm256_mask_test_epi8_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vptestmb %ymm0, %ymm1, %k0 {%k1} +; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 +; X86-NEXT: vptestmb %ymm0, %ymm0, %k0 {%k1} ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_test_epi8_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vptestmb %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: vptestmb %ymm0, %ymm0, %k0 {%k1} ; X64-NEXT: kmovd %k0, %eax ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -89,7 +95,8 @@ define zeroext i8 @test_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) { ; CHECK-LABEL: test_mm_test_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestmw %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: ret{{[l|q]}} @@ -105,16 +112,18 @@ ; X86-LABEL: test_mm_mask_test_epi16_mask: ; X86: # %bb.0: 
# %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86-NEXT: kmovd %eax, %k1 -; X86-NEXT: vptestmw %xmm0, %xmm1, %k0 {%k1} +; X86-NEXT: vptestmw %xmm0, %xmm0, %k0 {%k1} ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_test_epi16_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vptestmw %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: vptestmw %xmm0, %xmm0, %k0 {%k1} ; X64-NEXT: kmovd %k0, %eax ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: retq @@ -131,7 +140,8 @@ define zeroext i16 @test_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) { ; CHECK-LABEL: test_mm256_test_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestmw %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vzeroupper @@ -148,7 +158,8 @@ ; X86-LABEL: test_mm256_mask_test_epi16_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vptestmw %ymm0, %ymm1, %k0 {%k1} +; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 +; X86-NEXT: vptestmw %ymm0, %ymm0, %k0 {%k1} ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: vzeroupper @@ -156,8 +167,9 @@ ; ; X64-LABEL: test_mm256_mask_test_epi16_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vptestmw %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: vptestmw %ymm0, %ymm0, %k0 {%k1} ; X64-NEXT: kmovd %k0, %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: vzeroupper @@ -175,7 +187,8 @@ define zeroext i16 @test_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) { ; CHECK-LABEL: test_mm_testn_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: ret{{[l|q]}} @@ -191,15 +204,17 @@ ; X86-LABEL: test_mm_mask_testn_epi8_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vptestnmb %xmm0, %xmm1, %k0 {%k1} +; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86-NEXT: vptestnmb %xmm0, %xmm0, %k0 {%k1} ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_testn_epi8_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vptestnmb %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: vptestnmb %xmm0, %xmm0, %k0 {%k1} ; X64-NEXT: kmovd %k0, %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: retq @@ -216,7 +231,8 @@ define i32 @test_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) { ; CHECK-LABEL: test_mm256_testn_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} @@ -232,15 +248,17 @@ ; X86-LABEL: test_mm256_mask_testn_epi8_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vptestnmb %ymm0, %ymm1, %k0 {%k1} +; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 +; X86-NEXT: vptestnmb %ymm0, %ymm0, %k0 {%k1} ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_testn_epi8_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vptestnmb %ymm0, %ymm1, %k0 {%k1} 
+; X64-NEXT: vptestnmb %ymm0, %ymm0, %k0 {%k1} ; X64-NEXT: kmovd %k0, %eax ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -257,7 +275,8 @@ define zeroext i8 @test_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) { ; CHECK-LABEL: test_mm_testn_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: ret{{[l|q]}} @@ -273,16 +292,18 @@ ; X86-LABEL: test_mm_mask_testn_epi16_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86-NEXT: kmovd %eax, %k1 -; X86-NEXT: vptestnmw %xmm0, %xmm1, %k0 {%k1} +; X86-NEXT: vptestnmw %xmm0, %xmm0, %k0 {%k1} ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_testn_epi16_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vptestnmw %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: vptestnmw %xmm0, %xmm0, %k0 {%k1} ; X64-NEXT: kmovd %k0, %eax ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: retq @@ -299,7 +320,8 @@ define zeroext i16 @test_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) { ; CHECK-LABEL: test_mm256_testn_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vzeroupper @@ -316,7 +338,8 @@ ; X86-LABEL: test_mm256_mask_testn_epi16_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vptestnmw %ymm0, %ymm1, %k0 {%k1} +; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 +; X86-NEXT: vptestnmw %ymm0, %ymm0, %k0 {%k1} ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: vzeroupper @@ -324,8 +347,9 @@ ; ; X64-LABEL: test_mm256_mask_testn_epi16_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vptestnmw %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: vptestnmw %ymm0, %ymm0, %k0 {%k1} ; X64-NEXT: kmovd %k0, %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: vzeroupper Index: test/CodeGen/X86/avx512bwvl-vec-test-testn.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-vec-test-testn.ll +++ test/CodeGen/X86/avx512bwvl-vec-test-testn.ll @@ -5,7 +5,8 @@ define zeroext i16 @TEST_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_test_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestmb %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq @@ -21,7 +22,8 @@ define zeroext i16 @TEST_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_mask_test_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestmb %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax @@ -40,7 +42,8 @@ define zeroext i8 @TEST_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_test_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0 +; CHECK-NEXT: 
vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestmw %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq @@ -56,7 +59,8 @@ define zeroext i8 @TEST_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_mask_test_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestmw %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: andb %dil, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax @@ -75,7 +79,8 @@ define zeroext i16 @TEST_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_testn_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq @@ -91,7 +96,8 @@ define zeroext i16 @TEST_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_mask_testn_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestnmb %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax @@ -110,7 +116,8 @@ define zeroext i8 @TEST_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_testn_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq @@ -126,7 +133,8 @@ define zeroext i8 @TEST_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_mask_testn_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: andb %dil, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax @@ -145,7 +153,8 @@ define i32 @TEST_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_test_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestmb %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -161,7 +170,8 @@ define i32 @TEST_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_mask_test_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestmb %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: vzeroupper @@ -180,7 +190,8 @@ define zeroext i16 @TEST_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_test_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestmw %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; 
CHECK-NEXT: vzeroupper @@ -197,7 +208,8 @@ define zeroext i16 @TEST_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_mask_test_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestmw %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax @@ -217,7 +229,8 @@ define i32 @TEST_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_testn_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -233,7 +246,8 @@ define i32 @TEST_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_mask_testn_epi8_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: vzeroupper @@ -252,7 +266,8 @@ define zeroext i16 @TEST_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_testn_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: vzeroupper @@ -269,7 +284,8 @@ define zeroext i16 @TEST_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_mask_testn_epi16_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax Index: test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -14,16 +14,16 @@ define <4 x float> @test_mask_andnot_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) { ; X86-LABEL: test_mask_andnot_ps_rrk_128: ; X86: # %bb.0: +; X86-NEXT: vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x55,0xd1] -; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: vblendmps %xmm0, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrk_128: ; X64: # %bb.0: +; X64-NEXT: vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x55,0xd1] -; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: vblendmps %xmm0, %xmm2, %xmm0 {%k1} # 
encoding: [0x62,0xf2,0x6d,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) ret <4 x float> %res @@ -32,14 +32,16 @@ define <4 x float> @test_mask_andnot_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) { ; X86-LABEL: test_mask_andnot_ps_rrkz_128: ; X86: # %bb.0: +; X86-NEXT: vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x55,0xc1] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrkz_128: ; X64: # %bb.0: +; X64-NEXT: vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x55,0xc1] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) ret <4 x float> %res @@ -65,16 +67,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x55,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmk_128: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x55,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) @@ -85,14 +87,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x55,0x00] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmkz_128: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %xmm0, 
%xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x55,0x07] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) @@ -121,16 +125,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x55,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x55,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbk_128: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x55,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -143,14 +147,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x55,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x55,0x00] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbkz_128: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x55,0x07] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -173,16 +179,16 @@ define <8 x float> @test_mask_andnot_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) { ; X86-LABEL: test_mask_andnot_ps_rrk_256: ; X86: # %bb.0: +; X86-NEXT: vandnps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x55,0xd1] -; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrk_256: ; X64: # %bb.0: +; X64-NEXT: vandnps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xfc,0x55,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x55,0xd1] -; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) ret <8 x float> %res @@ -191,14 +197,16 @@ define <8 x float> @test_mask_andnot_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) { ; X86-LABEL: test_mask_andnot_ps_rrkz_256: ; X86: # %bb.0: +; X86-NEXT: vandnps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x55,0xc1] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrkz_256: ; X64: # %bb.0: +; X64-NEXT: vandnps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x55,0xc1] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res @@ -224,16 +232,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x55,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmk_256: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x55,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) @@ -244,14 +252,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x55,0x00] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: 
[0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmkz_256: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x55,0x07] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) @@ -280,16 +290,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x55,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x55,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbk_256: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x55,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -302,14 +312,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x55,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x55,0x00] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbkz_256: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x55,0x07] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -332,16 +344,16 @@ define <16 x float> @test_mask_andnot_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) { ; X86-LABEL: test_mask_andnot_ps_rrk_512: ; X86: # %bb.0: +; X86-NEXT: vandnps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x55,0xc1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x55,0xd1] -; X86-NEXT: vmovaps %zmm2, %zmm0 # 
encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrk_512: ; X64: # %bb.0: +; X64-NEXT: vandnps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x55,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x55,0xd1] -; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) ret <16 x float> %res @@ -350,14 +362,16 @@ define <16 x float> @test_mask_andnot_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) { ; X86-LABEL: test_mask_andnot_ps_rrkz_512: ; X86: # %bb.0: +; X86-NEXT: vandnps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x55,0xc1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x55,0xc1] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrkz_512: ; X64: # %bb.0: +; X64-NEXT: vandnps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x55,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x55,0xc1] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) ret <16 x float> %res @@ -383,16 +397,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x55,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x55,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmk_512: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x55,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) @@ -403,14 +417,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x55,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), 
%k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x55,0x00] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmkz_512: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x55,0x07] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) @@ -439,16 +455,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmbk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x55,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x55,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbk_512: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x55,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -461,14 +477,16 @@ ; X86-LABEL: test_mask_andnot_ps_rmbkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandnps (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x55,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x55,0x00] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbkz_512: ; X64: # %bb.0: +; X64-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x55,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x55,0x07] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -491,16 +509,16 @@ define <4 x float> @test_mask_and_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) { ; X86-LABEL: test_mask_and_ps_rrk_128: ; X86: # %bb.0: +; X86-NEXT: vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 
# encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x54,0xd1] -; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: vblendmps %xmm0, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrk_128: ; X64: # %bb.0: +; X64-NEXT: vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x54,0xd1] -; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: vblendmps %xmm0, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) ret <4 x float> %res @@ -509,14 +527,16 @@ define <4 x float> @test_mask_and_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) { ; X86-LABEL: test_mask_and_ps_rrkz_128: ; X86: # %bb.0: +; X86-NEXT: vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x54,0xc1] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrkz_128: ; X64: # %bb.0: +; X64-NEXT: vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x54,0xc1] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) ret <4 x float> %res @@ -542,16 +562,16 @@ ; X86-LABEL: test_mask_and_ps_rmk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x54,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmk_128: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x54,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) @@ -562,14 +582,16 @@ ; X86-LABEL: 
test_mask_and_ps_rmkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x54,0x00] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmkz_128: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x54,0x07] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) @@ -598,16 +620,16 @@ ; X86-LABEL: test_mask_and_ps_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x54,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x54,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbk_128: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x54,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -620,14 +642,16 @@ ; X86-LABEL: test_mask_and_ps_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x54,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x54,0x00] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbkz_128: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x54,0x07] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -650,16 +674,16 @@ define <8 x float> @test_mask_and_ps_rrk_256(<8 x 
float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) { ; X86-LABEL: test_mask_and_ps_rrk_256: ; X86: # %bb.0: +; X86-NEXT: vandps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x54,0xd1] -; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrk_256: ; X64: # %bb.0: +; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x54,0xd1] -; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) ret <8 x float> %res @@ -668,14 +692,16 @@ define <8 x float> @test_mask_and_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) { ; X86-LABEL: test_mask_and_ps_rrkz_256: ; X86: # %bb.0: +; X86-NEXT: vandps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x54,0xc1] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrkz_256: ; X64: # %bb.0: +; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x54,0xc1] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res @@ -701,16 +727,16 @@ ; X86-LABEL: test_mask_and_ps_rmk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x54,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmk_256: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x54,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: 
[0x62,0xf2,0x75,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) @@ -721,14 +747,16 @@ ; X86-LABEL: test_mask_and_ps_rmkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x54,0x00] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmkz_256: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x54,0x07] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) @@ -757,16 +785,16 @@ ; X86-LABEL: test_mask_and_ps_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x54,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x54,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbk_256: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x54,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -779,14 +807,16 @@ ; X86-LABEL: test_mask_and_ps_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x54,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x54,0x00] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbkz_256: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x54,0x07] +; X64-NEXT: vmovaps 
%ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -809,16 +839,16 @@ define <16 x float> @test_mask_and_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) { ; X86-LABEL: test_mask_and_ps_rrk_512: ; X86: # %bb.0: +; X86-NEXT: vandps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x54,0xc1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x54,0xd1] -; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrk_512: ; X64: # %bb.0: +; X64-NEXT: vandps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x54,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x54,0xd1] -; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) ret <16 x float> %res @@ -827,14 +857,16 @@ define <16 x float> @test_mask_and_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) { ; X86-LABEL: test_mask_and_ps_rrkz_512: ; X86: # %bb.0: +; X86-NEXT: vandps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x54,0xc1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x54,0xc1] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrkz_512: ; X64: # %bb.0: +; X64-NEXT: vandps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x54,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x54,0xc1] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) ret <16 x float> %res @@ -860,16 +892,16 @@ ; X86-LABEL: test_mask_and_ps_rmk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x54,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x54,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmk_512: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x54,0x0f] -; 
X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) @@ -880,14 +912,16 @@ ; X86-LABEL: test_mask_and_ps_rmkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x54,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x54,0x00] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmkz_512: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x54,0x07] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) @@ -916,16 +950,16 @@ ; X86-LABEL: test_mask_and_ps_rmbk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x54,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x54,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbk_512: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x54,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -938,14 +972,16 @@ ; X86-LABEL: test_mask_and_ps_rmbkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vandps (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x54,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x54,0x00] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbkz_512: ; X64: # %bb.0: +; X64-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x54,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps 
(%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x54,0x07] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -968,16 +1004,16 @@ define <4 x float> @test_mask_or_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) { ; X86-LABEL: test_mask_or_ps_rrk_128: ; X86: # %bb.0: +; X86-NEXT: vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x56,0xd1] -; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: vblendmps %xmm0, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrk_128: ; X64: # %bb.0: +; X64-NEXT: vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x56,0xd1] -; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: vblendmps %xmm0, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) ret <4 x float> %res @@ -986,14 +1022,16 @@ define <4 x float> @test_mask_or_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) { ; X86-LABEL: test_mask_or_ps_rrkz_128: ; X86: # %bb.0: +; X86-NEXT: vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x56,0xc1] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrkz_128: ; X64: # %bb.0: +; X64-NEXT: vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x56,0xc1] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) ret <4 x float> %res @@ -1019,16 +1057,16 @@ ; X86-LABEL: test_mask_or_ps_rmk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x56,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmk_128: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi), %xmm0, %xmm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xf8,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x56,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) @@ -1039,14 +1077,16 @@ ; X86-LABEL: test_mask_or_ps_rmkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x56,0x00] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmkz_128: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x56,0x07] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) @@ -1075,16 +1115,16 @@ ; X86-LABEL: test_mask_or_ps_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x56,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x56,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmbk_128: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x56,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -1097,14 +1137,16 @@ ; X86-LABEL: test_mask_or_ps_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x56,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x56,0x00] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] 
; ; X64-LABEL: test_mask_or_ps_rmbkz_128: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x56,0x07] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -1127,16 +1169,16 @@ define <8 x float> @test_mask_or_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) { ; X86-LABEL: test_mask_or_ps_rrk_256: ; X86: # %bb.0: +; X86-NEXT: vorps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x56,0xd1] -; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrk_256: ; X64: # %bb.0: +; X64-NEXT: vorps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x56,0xd1] -; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) ret <8 x float> %res @@ -1145,14 +1187,16 @@ define <8 x float> @test_mask_or_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) { ; X86-LABEL: test_mask_or_ps_rrkz_256: ; X86: # %bb.0: +; X86-NEXT: vorps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x56,0xc1] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrkz_256: ; X64: # %bb.0: +; X64-NEXT: vorps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x56,0xc1] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res @@ -1178,16 +1222,16 @@ ; X86-LABEL: test_mask_or_ps_rmk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x56,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: 
vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmk_256: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x56,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) @@ -1198,14 +1242,16 @@ ; X86-LABEL: test_mask_or_ps_rmkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x56,0x00] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmkz_256: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x56,0x07] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) @@ -1234,16 +1280,16 @@ ; X86-LABEL: test_mask_or_ps_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x56,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x56,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmbk_256: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x56,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -1256,14 +1302,16 @@ ; X86-LABEL: test_mask_or_ps_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x56,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: 
[0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x56,0x00] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmbkz_256: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x56,0x07] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -1286,16 +1334,16 @@ define <16 x float> @test_mask_or_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) { ; X86-LABEL: test_mask_or_ps_rrk_512: ; X86: # %bb.0: +; X86-NEXT: vorps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x56,0xc1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x56,0xd1] -; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrk_512: ; X64: # %bb.0: +; X64-NEXT: vorps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x56,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x56,0xd1] -; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) ret <16 x float> %res @@ -1304,14 +1352,16 @@ define <16 x float> @test_mask_or_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) { ; X86-LABEL: test_mask_or_ps_rrkz_512: ; X86: # %bb.0: +; X86-NEXT: vorps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x56,0xc1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x56,0xc1] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrkz_512: ; X64: # %bb.0: +; X64-NEXT: vorps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x56,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x56,0xc1] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) ret <16 x float> %res @@ -1337,16 +1387,16 @@ ; X86-LABEL: test_mask_or_ps_rmk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x56,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps 
(%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x56,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmk_512: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x56,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) @@ -1357,14 +1407,16 @@ ; X86-LABEL: test_mask_or_ps_rmkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x56,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x56,0x00] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmkz_512: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x56,0x07] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) @@ -1393,16 +1445,16 @@ ; X86-LABEL: test_mask_or_ps_rmbk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x56,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x56,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmbk_512: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x56,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -1415,14 +1467,16 @@ ; X86-LABEL: test_mask_or_ps_rmbkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vorps (%eax){1to16}, %zmm0, %zmm0 # encoding: 
[0x62,0xf1,0x7c,0x58,0x56,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x56,0x00] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmbkz_512: ; X64: # %bb.0: +; X64-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x56,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x56,0x07] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -1445,16 +1499,16 @@ define <4 x float> @test_mask_xor_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) { ; X86-LABEL: test_mask_xor_ps_rrk_128: ; X86: # %bb.0: +; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x57,0xd1] -; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: vblendmps %xmm0, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrk_128: ; X64: # %bb.0: +; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x57,0xd1] -; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: vblendmps %xmm0, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) ret <4 x float> %res @@ -1463,14 +1517,16 @@ define <4 x float> @test_mask_xor_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) { ; X86-LABEL: test_mask_xor_ps_rrkz_128: ; X86: # %bb.0: +; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x57,0xc1] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrkz_128: ; X64: # %bb.0: +; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x57,0xc1] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) ret <4 x float> %res @@ -1496,16 +1552,16 @@ ; X86-LABEL: test_mask_xor_ps_rmk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps 
(%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x57,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmk_128: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x57,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) @@ -1516,14 +1572,16 @@ ; X86-LABEL: test_mask_xor_ps_rmkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x57,0x00] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmkz_128: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x57,0x07] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) @@ -1552,16 +1610,16 @@ ; X86-LABEL: test_mask_xor_ps_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x57,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x57,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbk_128: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x57,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x57,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = 
insertelement <4 x float> undef, float %q, i32 0 @@ -1574,14 +1632,16 @@ ; X86-LABEL: test_mask_xor_ps_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x57,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x57,0x00] +; X86-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbkz_128: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x57,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x57,0x07] +; X64-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -1604,16 +1664,16 @@ define <8 x float> @test_mask_xor_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) { ; X86-LABEL: test_mask_xor_ps_rrk_256: ; X86: # %bb.0: +; X86-NEXT: vxorps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x57,0xd1] -; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrk_256: ; X64: # %bb.0: +; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x57,0xd1] -; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) ret <8 x float> %res @@ -1622,14 +1682,16 @@ define <8 x float> @test_mask_xor_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) { ; X86-LABEL: test_mask_xor_ps_rrkz_256: ; X86: # %bb.0: +; X86-NEXT: vxorps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0xc1] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x57,0xc1] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrkz_256: ; X64: # %bb.0: +; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x57,0xc1] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 
x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res @@ -1655,16 +1717,16 @@ ; X86-LABEL: test_mask_xor_ps_rmk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x57,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmk_256: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x57,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) @@ -1675,14 +1737,16 @@ ; X86-LABEL: test_mask_xor_ps_rmkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x57,0x00] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmkz_256: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x57,0x07] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) @@ -1711,16 +1775,16 @@ ; X86-LABEL: test_mask_xor_ps_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x57,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x57,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbk_256: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x57,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x57,0x0f] -; 
X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -1733,14 +1797,16 @@ ; X86-LABEL: test_mask_xor_ps_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x57,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x57,0x00] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbkz_256: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x57,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x57,0x07] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -1763,16 +1829,16 @@ define <16 x float> @test_mask_xor_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) { ; X86-LABEL: test_mask_xor_ps_rrk_512: ; X86: # %bb.0: +; X86-NEXT: vxorps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x57,0xc1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x57,0xd1] -; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrk_512: ; X64: # %bb.0: +; X64-NEXT: vxorps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x57,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x57,0xd1] -; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) ret <16 x float> %res @@ -1781,14 +1847,16 @@ define <16 x float> @test_mask_xor_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) { ; X86-LABEL: test_mask_xor_ps_rrkz_512: ; X86: # %bb.0: +; X86-NEXT: vxorps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x57,0xc1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x57,0xc1] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrkz_512: ; X64: # %bb.0: +; X64-NEXT: vxorps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x57,0xc1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: 
[0x62,0xf1,0x7c,0xc9,0x57,0xc1] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) ret <16 x float> %res @@ -1814,16 +1882,16 @@ ; X86-LABEL: test_mask_xor_ps_rmk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x57,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x57,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmk_512: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x57,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x57,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) @@ -1834,14 +1902,16 @@ ; X86-LABEL: test_mask_xor_ps_rmkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x57,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x57,0x00] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmkz_512: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x57,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x57,0x07] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) @@ -1870,16 +1940,16 @@ ; X86-LABEL: test_mask_xor_ps_rmbk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x57,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x57,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbk_512: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x57,0x07] ; X64-NEXT: kmovw 
%esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x57,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -1892,14 +1962,16 @@ ; X86-LABEL: test_mask_xor_ps_rmbkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vxorps (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x57,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x57,0x00] +; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbkz_512: ; X64: # %bb.0: +; X64-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x57,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x57,0x07] +; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 Index: test/CodeGen/X86/avx512f-vec-test-testn.ll =================================================================== --- test/CodeGen/X86/avx512f-vec-test-testn.ll +++ test/CodeGen/X86/avx512f-vec-test-testn.ll @@ -37,7 +37,8 @@ define zeroext i16 @TEST_mm512_test_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_test_epi32_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: vzeroupper @@ -73,7 +74,8 @@ define zeroext i16 @TEST_mm512_mask_test_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_test_epi32_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax @@ -125,7 +127,8 @@ define zeroext i16 @TEST_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_testn_epi32_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: vzeroupper @@ -161,7 +164,8 @@ define zeroext i16 @TEST_mm512_mask_testn_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_testn_epi32_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0 +; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax Index: 
test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -1453,7 +1453,8 @@ define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) { ; CHECK-LABEL: test_mm_test_epi32_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: ret{{[l|q]}} @@ -1470,16 +1471,18 @@ ; X86-LABEL: test_mm_mask_test_epi32_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} +; X86-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_test_epi32_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} ; X64-NEXT: kmovw %k0, %eax ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: retq @@ -1498,7 +1501,8 @@ define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) { ; CHECK-LABEL: test_mm256_test_epi32_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestmd %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestmd %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: vzeroupper @@ -1515,8 +1519,9 @@ ; X86-LABEL: test_mm256_mask_test_epi32_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1} +; X86-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: vzeroupper @@ -1524,8 +1529,9 @@ ; ; X64-LABEL: test_mm256_mask_test_epi32_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1} ; X64-NEXT: kmovw %k0, %eax ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: vzeroupper @@ -1632,7 +1638,8 @@ define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) { ; CHECK-LABEL: test_mm_testn_epi32_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k0 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: ret{{[l|q]}} @@ -1649,16 +1656,18 @@ ; X86-LABEL: test_mm_mask_testn_epi32_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1} +; X86-NEXT: vptestnmd %xmm0, %xmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_testn_epi32_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1} +; X64-NEXT: vptestnmd %xmm0, %xmm0, %k0 {%k1} ; X64-NEXT: kmovw %k0, %eax ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: retq @@ -1677,7 +1686,8 @@ define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) { ; CHECK-LABEL: 
test_mm256_testn_epi32_mask: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vptestnmd %ymm0, %ymm1, %k0 +; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: vzeroupper @@ -1694,8 +1704,9 @@ ; X86-LABEL: test_mm256_mask_testn_epi32_mask: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1} +; X86-NEXT: vptestnmd %ymm0, %ymm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: vzeroupper @@ -1703,8 +1714,9 @@ ; ; X64-LABEL: test_mm256_mask_testn_epi32_mask: ; X64: # %bb.0: # %entry +; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1} +; X64-NEXT: vptestnmd %ymm0, %ymm0, %k0 {%k1} ; X64-NEXT: kmovw %k0, %eax ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: vzeroupper Index: test/CodeGen/X86/avx512vl-logic.ll =================================================================== --- test/CodeGen/X86/avx512vl-logic.ll +++ test/CodeGen/X86/avx512vl-logic.ll @@ -222,14 +222,16 @@ define <4 x double> @test_mm256_mask_andnot_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) { ; KNL-LABEL: test_mm256_mask_andnot_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandnpd %ymm2, %ymm1, %ymm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnq %ymm2, %ymm1, %ymm0 {%k1} +; KNL-NEXT: vmovapd %ymm1, %ymm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_mask_andnot_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnpd %ymm2, %ymm1, %ymm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %ymm2, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovapd %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -246,14 +248,16 @@ define <4 x double> @test_mm256_maskz_andnot_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) { ; KNL-LABEL: test_mm256_maskz_andnot_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandnpd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} +; KNL-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_maskz_andnot_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnpd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -270,14 +274,16 @@ define <2 x double> @test_mm_mask_andnot_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { ; KNL-LABEL: test_mm_mask_andnot_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandnpd %xmm2, %xmm1, %xmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnq %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: vmovapd %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_mask_andnot_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnpd %xmm2, %xmm1, %xmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -294,14 +300,16 @@ define <2 x double> @test_mm_maskz_andnot_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { ; KNL-LABEL: test_mm_maskz_andnot_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandnpd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} +; KNL-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z} ; 
KNL-NEXT: retq ; ; SKX-LABEL: test_mm_maskz_andnot_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnpd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -318,14 +326,16 @@ define <8 x float> @test_mm256_mask_andnot_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) { ; KNL-LABEL: test_mm256_mask_andnot_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandnps %ymm2, %ymm1, %ymm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnd %ymm2, %ymm1, %ymm0 {%k1} +; KNL-NEXT: vmovaps %ymm1, %ymm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_mask_andnot_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnps %ymm2, %ymm1, %ymm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %ymm2, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovaps %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -341,14 +351,16 @@ define <8 x float> @test_mm256_maskz_andnot_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) { ; KNL-LABEL: test_mm256_maskz_andnot_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandnps %ymm1, %ymm0, %ymm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z} +; KNL-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_maskz_andnot_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnps %ymm1, %ymm0, %ymm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -364,14 +376,16 @@ define <4 x float> @test_mm_mask_andnot_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { ; KNL-LABEL: test_mm_mask_andnot_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandnps %xmm2, %xmm1, %xmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnd %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: vmovaps %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_mask_andnot_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnps %xmm2, %xmm1, %xmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -388,14 +402,16 @@ define <4 x float> @test_mm_maskz_andnot_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { ; KNL-LABEL: test_mm_maskz_andnot_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z} +; KNL-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_maskz_andnot_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -412,14 +428,16 @@ define <4 x double> @test_mm256_mask_and_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) { ; KNL-LABEL: test_mm256_mask_and_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandpd %ymm1, %ymm2, %ymm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandq %ymm1, %ymm2, %ymm0 {%k1} +; KNL-NEXT: vmovapd %ymm1, %ymm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_mask_and_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandpd %ymm1, %ymm2, %ymm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %ymm1, %ymm2, %ymm0 {%k1} +; 
SKX-NEXT: vmovapd %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -435,14 +453,16 @@ define <4 x double> @test_mm256_maskz_and_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) { ; KNL-LABEL: test_mm256_maskz_and_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandpd %ymm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandq %ymm0, %ymm1, %ymm0 {%k1} {z} +; KNL-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_maskz_and_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandpd %ymm0, %ymm1, %ymm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %ymm0, %ymm1, %ymm0 {%k1} {z} +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -458,14 +478,16 @@ define <2 x double> @test_mm_mask_and_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { ; KNL-LABEL: test_mm_mask_and_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandpd %xmm1, %xmm2, %xmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandq %xmm1, %xmm2, %xmm0 {%k1} +; KNL-NEXT: vmovapd %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_mask_and_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandpd %xmm1, %xmm2, %xmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -481,14 +503,16 @@ define <2 x double> @test_mm_maskz_and_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { ; KNL-LABEL: test_mm_maskz_and_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandpd %xmm0, %xmm1, %xmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandq %xmm0, %xmm1, %xmm0 {%k1} {z} +; KNL-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_maskz_and_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandpd %xmm0, %xmm1, %xmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %xmm0, %xmm1, %xmm0 {%k1} {z} +; SKX-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -504,14 +528,16 @@ define <8 x float> @test_mm256_mask_and_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) { ; KNL-LABEL: test_mm256_mask_and_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandps %ymm1, %ymm2, %ymm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandd %ymm1, %ymm2, %ymm0 {%k1} +; KNL-NEXT: vmovaps %ymm1, %ymm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_mask_and_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandps %ymm1, %ymm2, %ymm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: vmovaps %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -526,14 +552,16 @@ define <8 x float> @test_mm256_maskz_and_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) { ; KNL-LABEL: test_mm256_maskz_and_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandps %ymm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandd %ymm0, %ymm1, %ymm0 {%k1} {z} +; KNL-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_maskz_and_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandps %ymm0, %ymm1, %ymm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %ymm0, %ymm1, %ymm0 {%k1} {z} +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -548,14 +576,16 @@ define <4 x float> @test_mm_mask_and_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { ; KNL-LABEL: 
test_mm_mask_and_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandps %xmm1, %xmm2, %xmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandd %xmm1, %xmm2, %xmm0 {%k1} +; KNL-NEXT: vmovaps %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_mask_and_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandps %xmm1, %xmm2, %xmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -571,14 +601,16 @@ define <4 x float> @test_mm_maskz_and_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { ; KNL-LABEL: test_mm_maskz_and_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vandps %xmm0, %xmm1, %xmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpandd %xmm0, %xmm1, %xmm0 {%k1} {z} +; KNL-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_maskz_and_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vandps %xmm0, %xmm1, %xmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %xmm0, %xmm1, %xmm0 {%k1} {z} +; SKX-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -594,14 +626,16 @@ define <4 x double> @test_mm256_mask_xor_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) { ; KNL-LABEL: test_mm256_mask_xor_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vxorpd %ymm2, %ymm1, %ymm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxorq %ymm2, %ymm1, %ymm0 {%k1} +; KNL-NEXT: vmovapd %ymm1, %ymm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_mask_xor_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vxorpd %ymm2, %ymm1, %ymm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %ymm2, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovapd %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -617,14 +651,16 @@ define <4 x double> @test_mm256_maskz_xor_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) { ; KNL-LABEL: test_mm256_maskz_xor_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vxorpd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxorq %ymm1, %ymm0, %ymm0 {%k1} {z} +; KNL-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_maskz_xor_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -640,14 +676,16 @@ define <2 x double> @test_mm_mask_xor_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { ; KNL-LABEL: test_mm_mask_xor_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vxorpd %xmm2, %xmm1, %xmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxorq %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: vmovapd %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_mask_xor_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vxorpd %xmm2, %xmm1, %xmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -663,14 +701,16 @@ define <2 x double> @test_mm_maskz_xor_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { ; KNL-LABEL: test_mm_maskz_xor_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vxorpd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxorq %xmm1, %xmm0, %xmm0 {%k1} {z} +; KNL-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_maskz_xor_pd: ; SKX: ## 
%bb.0: ## %entry +; SKX-NEXT: vxorpd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -686,14 +726,16 @@ define <8 x float> @test_mm256_mask_xor_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) { ; KNL-LABEL: test_mm256_mask_xor_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxord %ymm2, %ymm1, %ymm0 {%k1} +; KNL-NEXT: vmovaps %ymm1, %ymm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_mask_xor_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %ymm2, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovaps %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -708,14 +750,16 @@ define <8 x float> @test_mm256_maskz_xor_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) { ; KNL-LABEL: test_mm256_maskz_xor_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z} +; KNL-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_maskz_xor_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -730,14 +774,16 @@ define <4 x float> @test_mm_mask_xor_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { ; KNL-LABEL: test_mm_mask_xor_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vxorps %xmm2, %xmm1, %xmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxord %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: vmovaps %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_mask_xor_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vxorps %xmm2, %xmm1, %xmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -753,14 +799,16 @@ define <4 x float> @test_mm_maskz_xor_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { ; KNL-LABEL: test_mm_maskz_xor_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z} +; KNL-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_maskz_xor_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -776,14 +824,16 @@ define <4 x double> @test_mm256_mask_or_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) { ; KNL-LABEL: test_mm256_mask_or_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vorpd %ymm1, %ymm2, %ymm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vporq %ymm1, %ymm2, %ymm0 {%k1} +; KNL-NEXT: vmovapd %ymm1, %ymm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_mask_or_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorpd %ymm1, %ymm2, %ymm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: vmovapd %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -799,14 +849,16 @@ 
define <4 x double> @test_mm256_maskz_or_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) { ; KNL-LABEL: test_mm256_maskz_or_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vorpd %ymm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vporq %ymm0, %ymm1, %ymm0 {%k1} {z} +; KNL-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_maskz_or_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorpd %ymm0, %ymm1, %ymm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %ymm0, %ymm1, %ymm0 {%k1} {z} +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -822,14 +874,16 @@ define <2 x double> @test_mm_mask_or_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { ; KNL-LABEL: test_mm_mask_or_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vorpd %xmm1, %xmm2, %xmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vporq %xmm1, %xmm2, %xmm0 {%k1} +; KNL-NEXT: vmovapd %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_mask_or_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorpd %xmm1, %xmm2, %xmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -845,14 +899,16 @@ define <2 x double> @test_mm_maskz_or_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { ; KNL-LABEL: test_mm_maskz_or_pd: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vporq %xmm0, %xmm1, %xmm0 {%k1} {z} +; KNL-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_maskz_or_pd: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %xmm0, %xmm1, %xmm0 {%k1} {z} +; SKX-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -868,14 +924,16 @@ define <8 x float> @test_mm256_mask_or_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) { ; KNL-LABEL: test_mm256_mask_or_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vorps %ymm1, %ymm2, %ymm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpord %ymm1, %ymm2, %ymm0 {%k1} +; KNL-NEXT: vmovaps %ymm1, %ymm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_mask_or_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorps %ymm1, %ymm2, %ymm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: vmovaps %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -890,14 +948,16 @@ define <8 x float> @test_mm256_maskz_or_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) { ; KNL-LABEL: test_mm256_maskz_or_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vorps %ymm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpord %ymm0, %ymm1, %ymm0 {%k1} {z} +; KNL-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm256_maskz_or_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorps %ymm0, %ymm1, %ymm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %ymm0, %ymm1, %ymm0 {%k1} {z} +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -912,14 +972,16 @@ define <4 x float> @test_mm_mask_or_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { ; KNL-LABEL: test_mm_mask_or_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vorps %xmm1, %xmm2, %xmm1 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpord %xmm1, %xmm2, %xmm0 {%k1} +; 
KNL-NEXT: vmovaps %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_mask_or_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorps %xmm1, %xmm2, %xmm1 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -935,14 +997,16 @@ define <4 x float> @test_mm_maskz_or_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { ; KNL-LABEL: test_mm_maskz_or_ps: ; KNL: ## %bb.0: ## %entry +; KNL-NEXT: vorps %xmm0, %xmm1, %xmm0 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vpord %xmm0, %xmm1, %xmm0 {%k1} {z} +; KNL-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: test_mm_maskz_or_ps: ; SKX: ## %bb.0: ## %entry +; SKX-NEXT: vorps %xmm0, %xmm1, %xmm0 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %xmm0, %xmm1, %xmm0 {%k1} {z} +; SKX-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> Index: test/CodeGen/X86/avx512vl-vec-test-testn.ll =================================================================== --- test/CodeGen/X86/avx512vl-vec-test-testn.ll +++ test/CodeGen/X86/avx512vl-vec-test-testn.ll @@ -29,14 +29,16 @@ define zeroext i8 @TEST_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_test_epi32_mask: ; X86_64: # %bb.0: # %entry -; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0 +; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86_64-NEXT: vptestmd %xmm0, %xmm0, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: def $al killed $al killed $eax ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_test_epi32_mask: ; I386: # %bb.0: # %entry -; I386-NEXT: vptestmd %xmm0, %xmm1, %k0 +; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 +; I386-NEXT: vptestmd %xmm0, %xmm0, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: def $al killed $al killed $eax ; I386-NEXT: retl @@ -78,7 +80,8 @@ define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_test_epi32_mask: ; X86_64: # %bb.0: # %entry -; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0 +; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 +; X86_64-NEXT: vptestmd %ymm0, %ymm0, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: def $al killed $al killed $eax ; X86_64-NEXT: vzeroupper @@ -86,7 +89,8 @@ ; ; I386-LABEL: TEST_mm256_test_epi32_mask: ; I386: # %bb.0: # %entry -; I386-NEXT: vptestmd %ymm0, %ymm1, %k0 +; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 +; I386-NEXT: vptestmd %ymm0, %ymm0, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: def $al killed $al killed $eax ; I386-NEXT: vzeroupper @@ -132,17 +136,19 @@ define zeroext i8 @TEST_mm_mask_test_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_mask_test_epi32_mask: ; X86_64: # %bb.0: # %entry +; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86_64-NEXT: kmovw %edi, %k1 -; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} +; X86_64-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: def $al killed $al killed $eax ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_mask_test_epi32_mask: ; I386: # %bb.0: # %entry +; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 ; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; I386-NEXT: kmovw %eax, %k1 -; I386-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} +; I386-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: def $al killed $al killed $eax ; I386-NEXT: retl @@ -194,7 +200,8 @@ define zeroext i8 
@TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_mask_test_epi32_mask: ; X86_64: # %bb.0: # %entry -; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0 +; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 +; X86_64-NEXT: vptestmd %ymm0, %ymm0, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: andb %dil, %al ; X86_64-NEXT: # kill: def $al killed $al killed $eax @@ -203,7 +210,8 @@ ; ; I386-LABEL: TEST_mm256_mask_test_epi32_mask: ; I386: # %bb.0: # %entry -; I386-NEXT: vptestmd %ymm0, %ymm1, %k0 +; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 +; I386-NEXT: vptestmd %ymm0, %ymm0, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: andb {{[0-9]+}}(%esp), %al ; I386-NEXT: # kill: def $al killed $al killed $eax @@ -246,14 +254,16 @@ define zeroext i8 @TEST_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_testn_epi32_mask: ; X86_64: # %bb.0: # %entry -; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0 +; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86_64-NEXT: vptestnmd %xmm0, %xmm0, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: def $al killed $al killed $eax ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_testn_epi32_mask: ; I386: # %bb.0: # %entry -; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0 +; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 +; I386-NEXT: vptestnmd %xmm0, %xmm0, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: def $al killed $al killed $eax ; I386-NEXT: retl @@ -295,7 +305,8 @@ define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_testn_epi32_mask: ; X86_64: # %bb.0: # %entry -; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0 +; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 +; X86_64-NEXT: vptestnmd %ymm0, %ymm0, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: def $al killed $al killed $eax ; X86_64-NEXT: vzeroupper @@ -303,7 +314,8 @@ ; ; I386-LABEL: TEST_mm256_testn_epi32_mask: ; I386: # %bb.0: # %entry -; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0 +; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 +; I386-NEXT: vptestnmd %ymm0, %ymm0, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: def $al killed $al killed $eax ; I386-NEXT: vzeroupper @@ -349,17 +361,19 @@ define zeroext i8 @TEST_mm_mask_testn_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_mask_testn_epi32_mask: ; X86_64: # %bb.0: # %entry +; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86_64-NEXT: kmovw %edi, %k1 -; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1} +; X86_64-NEXT: vptestnmd %xmm0, %xmm0, %k0 {%k1} ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: def $al killed $al killed $eax ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_mask_testn_epi32_mask: ; I386: # %bb.0: # %entry +; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 ; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; I386-NEXT: kmovw %eax, %k1 -; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1} +; I386-NEXT: vptestnmd %xmm0, %xmm0, %k0 {%k1} ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: def $al killed $al killed $eax ; I386-NEXT: retl @@ -411,7 +425,8 @@ define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_mask_testn_epi32_mask: ; X86_64: # %bb.0: # %entry -; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0 +; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 +; X86_64-NEXT: vptestnmd %ymm0, %ymm0, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: andb %dil, %al ; X86_64-NEXT: # kill: def $al killed $al killed 
$eax @@ -420,7 +435,8 @@ ; ; I386-LABEL: TEST_mm256_mask_testn_epi32_mask: ; I386: # %bb.0: # %entry -; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0 +; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 +; I386-NEXT: vptestnmd %ymm0, %ymm0, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: andb {{[0-9]+}}(%esp), %al ; I386-NEXT: # kill: def $al killed $al killed $eax Index: test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll =================================================================== --- test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -673,7 +673,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 Index: test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll =================================================================== --- test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -863,7 +863,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 Index: test/CodeGen/X86/bitcast-setcc-128.ll =================================================================== --- test/CodeGen/X86/bitcast-setcc-128.ll +++ test/CodeGen/X86/bitcast-setcc-128.ll @@ -708,7 +708,6 @@ ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpmovmskb %ymm0, %ecx ; AVX2-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000 ; AVX2-NEXT: orq %rcx, %rax Index: test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll =================================================================== --- test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -359,7 +359,8 @@ ; AVX-LABEL: f64i8_i32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -367,7 +368,7 @@ ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; 
AVX-NEXT: retl @@ -391,7 +392,8 @@ ; AVX-64-LABEL: f64i8_i32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-64-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -399,7 +401,7 @@ ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -429,8 +431,8 @@ ; AVX-LABEL: f64xi8_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -438,7 +440,7 @@ ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -462,8 +464,8 @@ ; AVX-64-LABEL: f64xi8_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275] -; AVX-64-NEXT: # xmm3 = mem[0,0] +; AVX-64-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -471,7 +473,7 @@ ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -509,7 +511,7 @@ ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -543,7 +545,7 @@ ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -844,7 +846,8 @@ ; AVX-LABEL: f32xi16_i32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, 
%ymm1, %xmm2 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] +; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -852,7 +855,7 @@ ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -876,7 +879,8 @@ ; AVX-64-LABEL: f32xi16_i32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] +; AVX-64-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -884,7 +888,7 @@ ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -914,8 +918,8 @@ ; AVX-LABEL: f32xi16_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [4.1720559249406128E-309,4.1720559249406128E-309] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -923,7 +927,7 @@ ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -947,8 +951,8 @@ ; AVX-64-LABEL: f32xi16_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = [4.1720559249406128E-309,4.1720559249406128E-309] -; AVX-64-NEXT: # xmm3 = mem[0,0] +; AVX-64-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -956,7 +960,7 @@ ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -994,7 +998,7 @@ ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: 
vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -1028,7 +1032,7 @@ ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1256,8 +1260,8 @@ ; AVX-LABEL: f16xi32_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [2.1219957909652723E-314,2.1219957909652723E-314] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -1265,7 +1269,7 @@ ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -1283,14 +1287,14 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl ; ; AVX-64-LABEL: f16xi32_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = [2.1219957909652723E-314,2.1219957909652723E-314] -; AVX-64-NEXT: # xmm3 = mem[0,0] +; AVX-64-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -1298,7 +1302,7 @@ ; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1316,7 +1320,7 @@ ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296] ; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: retq %res1 = add <16 x i32> , %a %res2 = and <16 x i32> , %res1 @@ -1336,7 +1340,7 @@ ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -1356,7 +1360,7 @@ ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl ; ; 
AVX-64-LABEL: f16xi32_i128: @@ -1370,7 +1374,7 @@ ; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1390,7 +1394,7 @@ ; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: retq %res1 = add <16 x i32> , %a %res2 = and <16 x i32> , %res1 Index: test/CodeGen/X86/fma-fneg-combine.ll =================================================================== --- test/CodeGen/X86/fma-fneg-combine.ll +++ test/CodeGen/X86/fma-fneg-combine.ll @@ -8,7 +8,7 @@ define <16 x float> @test1(<16 x float> %a, <16 x float> %b, <16 x float> %c) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213ps %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; CHECK-NEXT: retq entry: %sub.i = fsub <16 x float> , %c @@ -24,7 +24,7 @@ define <16 x float> @test2(<16 x float> %a, <16 x float> %b, <16 x float> %c) { ; CHECK-LABEL: test2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; CHECK-NEXT: retq entry: %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i32 4) #2 @@ -35,7 +35,7 @@ define <16 x float> @test3(<16 x float> %a, <16 x float> %b, <16 x float> %c) { ; CHECK-LABEL: test3: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213ps %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 ; CHECK-NEXT: retq entry: %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2 @@ -46,7 +46,7 @@ define <16 x float> @test4(<16 x float> %a, <16 x float> %b, <16 x float> %c) { ; CHECK-LABEL: test4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 ; CHECK-NEXT: retq entry: %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2 @@ -105,7 +105,7 @@ define <8 x double> @test9(<8 x double> %a, <8 x double> %b, <8 x double> %c) { ; CHECK-LABEL: test9: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 ; CHECK-NEXT: retq entry: %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 4) #2 @@ -118,7 +118,7 @@ define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: test10: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq entry: @@ -160,13 +160,13 @@ ; SKX-LABEL: test11b: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; SKX-NEXT: retq ; ; KNL-LABEL: test11b: ; KNL: # 
%bb.0: # %entry ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 ; KNL-NEXT: retq entry: %sub.i = fsub <4 x float> , %c @@ -180,14 +180,14 @@ ; SKX-LABEL: test12: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2 ; SKX-NEXT: vxorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; SKX-NEXT: retq ; ; KNL-LABEL: test12: ; KNL: # %bb.0: # %entry ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2 ; KNL-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; KNL-NEXT: retq entry: @@ -297,13 +297,13 @@ ; SKX-LABEL: test17: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vfmsubadd132pd %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2 ; SKX-NEXT: retq ; ; KNL-LABEL: test17: ; KNL: # %bb.0: ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vfmsubadd132pd %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2 ; KNL-NEXT: retq %sub.i = fsub <8 x double> , %c %res = call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %sub.i, i32 4) @@ -317,13 +317,13 @@ ; SKX-LABEL: test18: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; SKX-NEXT: retq ; ; KNL-LABEL: test18: ; KNL: # %bb.0: # %entry ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 ; KNL-NEXT: retq entry: %sub.i = fsub <4 x float> , %b @@ -335,13 +335,13 @@ ; SKX-LABEL: test19: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; SKX-NEXT: retq ; ; KNL-LABEL: test19: ; KNL: # %bb.0: # %entry ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 ; KNL-NEXT: retq entry: %sub.i = fsub <4 x float> , %b @@ -354,14 +354,14 @@ ; SKX-LABEL: test20: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1} +; SKX-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 ; SKX-NEXT: vmovaps %xmm2, %xmm0 ; SKX-NEXT: retq ; ; KNL-LABEL: test20: ; KNL: # %bb.0: # %entry ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1} +; KNL-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 ; KNL-NEXT: vmovaps %xmm2, %xmm0 ; KNL-NEXT: retq entry: Index: test/CodeGen/X86/horizontal-reduce-umax.ll =================================================================== --- test/CodeGen/X86/horizontal-reduce-umax.ll +++ test/CodeGen/X86/horizontal-reduce-umax.ll @@ -409,40 +409,17 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1-LABEL: test_reduce_v16i8: -; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax -; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax -; 
X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_reduce_v16i8: -; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax -; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX2-NEXT: retq -; -; X64-AVX512-LABEL: test_reduce_v16i8: -; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax -; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX512-NEXT: retq +; X64-AVX-LABEL: test_reduce_v16i8: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> %2 = icmp ugt <16 x i8> %a0, %1 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 @@ -1104,11 +1081,12 @@ ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -1986,11 +1964,12 @@ ; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper Index: test/CodeGen/X86/movmsk-cmp.ll =================================================================== --- test/CodeGen/X86/movmsk-cmp.ll +++ test/CodeGen/X86/movmsk-cmp.ll @@ -1809,8 +1809,7 @@ ; ; SKX-LABEL: allones_v4i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; SKX-NEXT: vptestmd %xmm1, %xmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: andb $15, %al ; SKX-NEXT: cmpb $15, %al @@ -1842,8 +1841,7 @@ ; ; SKX-LABEL: 
allzeros_v4i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; SKX-NEXT: vptestmd %xmm1, %xmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: testb $15, %al ; SKX-NEXT: sete %al @@ -1893,8 +1891,7 @@ ; ; SKX-LABEL: allones_v8i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; SKX-NEXT: vptestmd %ymm1, %ymm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0 ; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: setb %al ; SKX-NEXT: vzeroupper @@ -1944,8 +1941,7 @@ ; ; SKX-LABEL: allzeros_v8i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; SKX-NEXT: vptestmd %ymm1, %ymm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0 ; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper @@ -2022,8 +2018,7 @@ ; ; SKX-LABEL: allones_v16i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SKX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; SKX-NEXT: kortestw %k0, %k0 ; SKX-NEXT: setb %al ; SKX-NEXT: vzeroupper @@ -2100,8 +2095,7 @@ ; ; SKX-LABEL: allzeros_v16i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SKX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; SKX-NEXT: kortestw %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper @@ -3141,8 +3135,7 @@ ; ; SKX-LABEL: allones_v4i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; SKX-NEXT: vptestmd %xmm1, %xmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: andb $15, %al ; SKX-NEXT: cmpb $15, %al @@ -3174,8 +3167,7 @@ ; ; SKX-LABEL: allzeros_v4i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; SKX-NEXT: vptestmd %xmm1, %xmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: testb $15, %al ; SKX-NEXT: sete %al @@ -3225,8 +3217,7 @@ ; ; SKX-LABEL: allones_v8i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; SKX-NEXT: vptestmd %ymm1, %ymm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0 ; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: setb %al ; SKX-NEXT: vzeroupper @@ -3276,8 +3267,7 @@ ; ; SKX-LABEL: allzeros_v8i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; SKX-NEXT: vptestmd %ymm1, %ymm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0 ; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper @@ -3354,8 +3344,7 @@ ; ; SKX-LABEL: allones_v16i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; SKX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; SKX-NEXT: kortestw %k0, %k0 ; SKX-NEXT: setb %al ; SKX-NEXT: vzeroupper @@ -3432,8 +3421,7 @@ ; ; SKX-LABEL: allzeros_v16i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; SKX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; SKX-NEXT: kortestw %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper Index: test/CodeGen/X86/nontemporal-loads.ll =================================================================== --- test/CodeGen/X86/nontemporal-loads.ll +++ test/CodeGen/X86/nontemporal-loads.ll @@ -1800,23 +1800,35 @@ define 
<16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; SSE2-LABEL: test_masked_v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pcmpeqd %xmm12, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm12, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: pxor %xmm0, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm12, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm11 +; SSE2-NEXT: pxor %xmm0, %xmm11 +; SSE2-NEXT: pcmpeqd %xmm12, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pandn (%rdi), %xmm4 +; SSE2-NEXT: pandn %xmm10, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn 16(%rdi), %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm11 +; SSE2-NEXT: por %xmm5, %xmm11 ; SSE2-NEXT: pandn 32(%rdi), %xmm6 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm9 +; SSE2-NEXT: por %xmm6, %xmm9 ; SSE2-NEXT: pandn 48(%rdi), %xmm7 -; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pandn %xmm3, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_masked_v16i32: Index: test/CodeGen/X86/psubus.ll =================================================================== --- test/CodeGen/X86/psubus.ll +++ test/CodeGen/X86/psubus.ll @@ -531,18 +531,16 @@ ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm7 -; SSE41-NEXT: pmaxud %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pshufb %xmm6, %xmm7 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pmaxud %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: packssdw %xmm6, %xmm0 ; SSE41-NEXT: psubd %xmm2, %xmm3 ; SSE41-NEXT: psubd %xmm1, %xmm4 -; SSE41-NEXT: pshufb %xmm6, %xmm4 -; SSE41-NEXT: pshufb %xmm6, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: pshufb %xmm1, %xmm3 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE41-NEXT: pandn %xmm4, %xmm0 ; SSE41-NEXT: retq @@ -916,18 +914,16 @@ ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm7 -; SSE41-NEXT: pminud %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pshufb %xmm6, %xmm7 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pminud %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: packssdw %xmm6, %xmm4 ; SSE41-NEXT: psubd %xmm2, %xmm3 ; SSE41-NEXT: psubd %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm6, %xmm0 -; SSE41-NEXT: pshufb %xmm6, %xmm3 +; 
SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm1, %xmm3 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: retq @@ -1052,18 +1048,16 @@ ; SSE41-NEXT: pcmpeqd %xmm1, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm7 -; SSE41-NEXT: pmaxud %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pshufb %xmm6, %xmm7 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: pmaxud %xmm3, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: packssdw %xmm6, %xmm4 ; SSE41-NEXT: psubd %xmm2, %xmm3 ; SSE41-NEXT: psubd %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm6, %xmm0 -; SSE41-NEXT: pshufb %xmm6, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm1, %xmm3 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: retq Index: test/CodeGen/X86/sat-add.ll =================================================================== --- test/CodeGen/X86/sat-add.ll +++ test/CodeGen/X86/sat-add.ll @@ -746,15 +746,16 @@ ; SSE2-LABEL: unsigned_sat_variable_v4i32_using_min: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: pxor %xmm0, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647] ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; Index: test/CodeGen/X86/sse-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -79,12 +79,15 @@ ; ; AVX1-LABEL: test_mm_andnot_ps: ; AVX1: # %bb.0: -; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0xc1] +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xef,0xc2] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_andnot_ps: ; AVX512: # %bb.0: -; AVX512-NEXT: vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1] +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -272,17 +272,22 @@ define <2 x double> 
@test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounwind { ; SSE-LABEL: test_mm_andnot_pd: ; SSE: # %bb.0: -; SSE-NEXT: andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1] +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 # encoding: [0x66,0x0f,0x76,0xd2] +; SSE-NEXT: pxor %xmm2, %xmm0 # encoding: [0x66,0x0f,0xef,0xc2] +; SSE-NEXT: pand %xmm1, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc1] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_andnot_pd: ; AVX1: # %bb.0: -; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0xc1] +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xef,0xc2] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_andnot_pd: ; AVX512: # %bb.0: -; AVX512-NEXT: vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1] +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x double> %a0 to <4 x i32> %arg1 = bitcast <2 x double> %a1 to <4 x i32> Index: test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll =================================================================== --- test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll +++ test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll @@ -132,9 +132,9 @@ ; ; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm0 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm2 +; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm2 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm2 ; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0 @@ -142,9 +142,9 @@ ; ; CHECK-XOP-LABEL: in_constant_varx_mone_invmask: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0 ; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vpxor (%rdi), %xmm1, %xmm2 +; CHECK-XOP-NEXT: vpxor (%rdx), %xmm1, %xmm2 ; CHECK-XOP-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq Index: test/CodeGen/X86/vec-copysign-avx512.ll =================================================================== --- test/CodeGen/X86/vec-copysign-avx512.ll +++ test/CodeGen/X86/vec-copysign-avx512.ll @@ -43,7 +43,7 @@ ; AVX512VL: ## %bb.0: ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512VLDQ-LABEL: v16f32: Index: test/CodeGen/X86/vector-bitreverse.ll =================================================================== --- test/CodeGen/X86/vector-bitreverse.ll +++ test/CodeGen/X86/vector-bitreverse.ll @@ -2046,27 +2046,27 @@ ; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm1 ; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm2 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpord %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpslld $24, %zmm0, %zmm2 ; AVX512F-NEXT: vpslld $8, %zmm0, %zmm0 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm1, %zmm0, 
%zmm0 -; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 ; AVX512F-NEXT: vpslld $4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $4, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 ; AVX512F-NEXT: vpslld $2, %zmm1, %zmm1 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $2, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 ; AVX512F-NEXT: vpslld $1, %zmm1, %zmm1 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_bitreverse_v16i32: Index: test/CodeGen/X86/vector-lzcnt-512.ll =================================================================== --- test/CodeGen/X86/vector-lzcnt-512.ll +++ test/CodeGen/X86/vector-lzcnt-512.ll @@ -29,11 +29,11 @@ ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 @@ -106,11 +106,11 @@ ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 @@ -172,20 +172,20 @@ ; AVX512BW-LABEL: testv16i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $2, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, 
%zmm0 ; AVX512BW-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 @@ -201,15 +201,15 @@ ; AVX512DQ-LABEL: testv16i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrld $1, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $4, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -257,20 +257,20 @@ ; AVX512BW-LABEL: testv16i32u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $2, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 @@ -286,15 +286,15 @@ ; AVX512DQ-LABEL: 
testv16i32u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrld $1, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $4, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] Index: test/CodeGen/X86/vector-reduce-and.ll =================================================================== --- test/CodeGen/X86/vector-reduce-and.ll +++ test/CodeGen/X86/vector-reduce-and.ll @@ -309,13 +309,13 @@ ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -372,15 +372,15 @@ ; ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq Index: test/CodeGen/X86/vector-reduce-or.ll =================================================================== --- test/CodeGen/X86/vector-reduce-or.ll +++ test/CodeGen/X86/vector-reduce-or.ll @@ -309,13 +309,13 @@ ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -372,15 +372,15 @@ ; ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vporq 
%zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq Index: test/CodeGen/X86/vector-reduce-umax.ll =================================================================== --- test/CodeGen/X86/vector-reduce-umax.ll +++ test/CodeGen/X86/vector-reduce-umax.ll @@ -1583,11 +1583,12 @@ ; ; AVX512VL-LABEL: test_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm2 +; AVX512VL-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: retq @@ -1675,11 +1676,12 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm2 +; AVX512VL-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: vzeroupper @@ -1780,11 +1782,12 @@ ; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm2 +; AVX512VL-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: vzeroupper @@ -1903,11 +1906,12 @@ ; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm2 +; AVX512VL-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq 
$15, %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: vzeroupper Index: test/CodeGen/X86/vector-reduce-xor.ll =================================================================== --- test/CodeGen/X86/vector-reduce-xor.ll +++ test/CodeGen/X86/vector-reduce-xor.ll @@ -309,13 +309,13 @@ ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -372,15 +372,15 @@ ; ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq Index: test/CodeGen/X86/vector-rotate-512.ll =================================================================== --- test/CodeGen/X86/vector-rotate-512.ll +++ test/CodeGen/X86/vector-rotate-512.ll @@ -876,7 +876,7 @@ ; AVX512-LABEL: splatconstant_rotate_mask_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vprold $4, %zmm0, %zmm0 -; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: retq %shl = shl <16 x i32> %a, %lshr = lshr <16 x i32> %a, @@ -980,10 +980,8 @@ ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -991,10 +989,8 @@ ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq Index: test/CodeGen/X86/vector-trunc-math.ll =================================================================== --- test/CodeGen/X86/vector-trunc-math.ll +++ test/CodeGen/X86/vector-trunc-math.ll @@ -3505,7 +3505,7 @@ ; ; AVX512-LABEL: trunc_and_v16i32_v16i8: ; AVX512: # %bb.0: -; 
AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -4309,7 +4309,7 @@ ; ; AVX512-LABEL: trunc_xor_v16i32_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -5113,7 +5113,7 @@ ; ; AVX512-LABEL: trunc_or_v16i32_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq Index: test/CodeGen/X86/vector-tzcnt-512.ll =================================================================== --- test/CodeGen/X86/vector-tzcnt-512.ll +++ test/CodeGen/X86/vector-tzcnt-512.ll @@ -128,7 +128,7 @@ ; AVX512CD: # %bb.0: ; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0 @@ -138,7 +138,7 @@ ; AVX512CDBW: # %bb.0: ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 @@ -148,7 +148,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -169,7 +169,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -177,7 +177,7 @@ ; BITALG: # %bb.0: ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; BITALG-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; BITALG-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] @@ -195,7 +195,7 @@ ; AVX512CD: # %bb.0: ; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = 
[32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0 @@ -205,7 +205,7 @@ ; AVX512CDBW: # %bb.0: ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 @@ -215,7 +215,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -236,7 +236,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -244,7 +244,7 @@ ; BITALG: # %bb.0: ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; BITALG-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; BITALG-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]