Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5601,6 +5601,24 @@
     }
   }
 
+  auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
+    SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(),
+                                LDBase->getBasePtr(), LDBase->getPointerInfo(),
+                                LDBase->isVolatile(), LDBase->isNonTemporal(),
+                                LDBase->isInvariant(), LDBase->getAlignment());
+
+    if (LDBase->hasAnyUseOfValue(1)) {
+      SDValue NewChain =
+          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+                      SDValue(NewLd.getNode(), 1));
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                             SDValue(NewLd.getNode(), 1));
+    }
+
+    return NewLd;
+  };
+
   // LOAD - all consecutive load/undefs (must start/end with a load).
   // If we have found an entire vector of loads and undefs, then return a large
   // load of the entire vector width starting at the base pointer.
@@ -5616,23 +5634,7 @@
     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
       return SDValue();
 
-    SDValue NewLd = SDValue();
-
-    NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
-                        LDBase->getPointerInfo(), LDBase->isVolatile(),
-                        LDBase->isNonTemporal(), LDBase->isInvariant(),
-                        LDBase->getAlignment());
-
-    if (LDBase->hasAnyUseOfValue(1)) {
-      SDValue NewChain =
-          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
-                      SDValue(NewLd.getNode(), 1));
-      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
-      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
-                             SDValue(NewLd.getNode(), 1));
-    }
-
-    return NewLd;
+    return CreateLoad(VT, LDBase);
   }
 
   int LoadSize =
@@ -5666,6 +5668,19 @@
     return DAG.getBitcast(VT, ResNode);
   }
 
+  // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
+  if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
+      ((VT.is128BitVector() && TLI.isTypeLegal(MVT::v4f32)) ||
+       (VT.is256BitVector() && TLI.isTypeLegal(MVT::v8f32)) ||
+       (VT.is512BitVector() && TLI.isTypeLegal(MVT::v16f32)))) {
+    MVT VecVT = MVT::getVectorVT(MVT::f32, VT.getSizeInBits() / 32);
+    SDValue V = CreateLoad(MVT::f32, LDBase);
+    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
+    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
+    return DAG.getBitcast(VT, V);
+  }
+
   return SDValue();
 }
 
@@ -13659,7 +13674,7 @@
                             DAG.getConstant(ShiftInx, DL, ExtVT));
     ShiftNode = DAG.getBitcast(InVT, ShiftNode);
     return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
-  } 
+  }
 
   // Use TESTD/Q, extended vector to packed dword/qword.
   assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
          "Unexpected vector type.");
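To illustrate the new VZEXT_MOVL path, here is a minimal IR reproducer written
in the style of the merge-consecutive-loads tests updated below. It is a
sketch, not one of the patch's test cases: the function name and the exact
zero-fill layout are assumptions for illustration. Elements 0-1 form a
consecutive 32-bit run of loads and all remaining elements are zero, so
EltsFromConsecutiveLoads should now emit a single f32 load wrapped in
SCALAR_TO_VECTOR + VZEXT_MOVL (a lone movss) instead of per-element inserts:

; Illustrative sketch -- function name and layout are assumptions, not taken
; from the patch's test updates.
define <8 x i16> @merge_8i16_i16_01zzzzzz(i16* %ptr) {
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 1
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  ; Elements 0-1 are consecutive loads (32 bits total), elements 2-7 are zero.
  %res0 = insertelement <8 x i16> zeroinitializer, i16 %val0, i32 0
  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
  ret <8 x i16> %res1
}

With this patch the expected lowering is a single
"movss {{.*#+}} xmm0 = mem[0],zero,zero,zero", matching the CHECK updates in
the tests below.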
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -31,9 +31,9 @@
   RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
 
   // The mask VT.
-  ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1", 
-                                        "v" # NumElts # "i1")); 
- 
+  ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1",
+                                        "v" # NumElts # "i1"));
+
   // The GPR register class that can hold the write mask. Use GR8 for fewer
   // than 8 elements. Use shift-right and equal to work around the lack of
   // !lt in tablegen.
@@ -998,11 +998,11 @@
 
 multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
                                       X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
-  let mayLoad = 1 in
-  defm rm : AVX512_maskable,
+            (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
                   AVX5128IBase, EVEX;
 }
@@ -1118,7 +1118,7 @@
                   [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX;
 }
 
-multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, 
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
                                  AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> {
   let Predicates = [HasCDI] in
   defm Z : avx512_mask_broadcastm, EVEX_V512;
@@ -1904,7 +1904,7 @@
                       (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
                       OpcodeStr##_.Suffix#
                       "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
-                      [(set _.KRC:$dst,(or _.KRCWM:$mask, 
+                      [(set _.KRC:$dst,(or _.KRCWM:$mask,
                             (OpNode (_.VT _.RC:$src1),
                                     (i32 imm:$src2))))], NoItinerary>, EVEX_K;
   let mayLoad = 1, AddedComplexity = 20 in {
@@ -1919,7 +1919,7 @@
                       (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
                       OpcodeStr##_.Suffix##
                       "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
-                      [(set _.KRC:$dst,(or _.KRCWM:$mask, 
+                      [(set _.KRC:$dst,(or _.KRCWM:$mask,
                             (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
                                     (i32 imm:$src2))))], NoItinerary>, EVEX_K;
   }
@@ -1940,7 +1940,7 @@
                       (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
                       OpcodeStr##_.Suffix#
                       "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
-                      [(set _.KRC:$dst,(or _.KRCWM:$mask, 
+                      [(set _.KRC:$dst,(or _.KRCWM:$mask,
                             (OpNode (_.VT _.RC:$src1),
                                     (i32 imm:$src2))))], NoItinerary>, EVEX_K;
   let mayLoad = 1 in {
@@ -1948,14 +1948,14 @@
                     (ins _.MemOp:$src1, i32u8imm:$src2),
                     OpcodeStr##_.Suffix##mem#
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    [(set _.KRC:$dst,(OpNode 
+                    [(set _.KRC:$dst,(OpNode
                           (_.VT (bitconvert (_.LdFrag addr:$src1))),
                           (i32 imm:$src2)))], NoItinerary>;
     def rmk : AVX512, EVEX_K;
     def rmb : AVX512,EVEX_B;
     def rmbk : AVX512, EVEX_B, EVEX_K;
@@ -1981,10 +1981,10 @@
 }
 
 multiclass avx512_vector_fpclass_all<string OpcodeStr,
-                AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd, 
+                AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd,
                 string broadcast>{
   let Predicates = [prd] in {
-    defm Z : avx512_vector_fpclass, EVEX_V512;
   }
   let Predicates = [prd, HasVLX] in {
@@ -1997,9 +1997,9 @@
 multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
                                  bits<8> opcScalar, SDNode VecOpNode,
                                  SDNode ScalarOpNode, Predicate prd>{
-  defm PS : avx512_vector_fpclass_all, EVEX_CD8<32, CD8VF>;
-  defm PD : avx512_vector_fpclass_all,EVEX_CD8<64, CD8VF> , VEX_W;
   defm SS : avx512_scalar_fpclass, EVEX_CD8<32, CD8VT1>;
@@ -2113,12 +2113,12 @@
 def : Pat<(store VK4:$src, addr:$dst),
           (MOV8mr addr:$dst,
            (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)),
-            sub_8bit))>; 
+            sub_8bit))>;
 def : Pat<(store VK8:$src, addr:$dst),
           (MOV8mr addr:$dst,
            (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
             sub_8bit))>;
- 
+
 def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
           (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
 def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
@@ -2596,7 +2596,7 @@
   def rrkz : AVX512PI,
@@ -2919,24 +2919,24 @@
 // AVX-512  MOVSS, MOVSD
 //===----------------------------------------------------------------------===//
 
-multiclass avx512_move_scalar<string asm, SDNode OpNode, X86VectorVTInfo _> { 
+multiclass avx512_move_scalar<string asm, SDNode OpNode, X86VectorVTInfo _> {
-  defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), 
+  defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst),
               (ins _.RC:$src1, _.RC:$src2),
-              asm, "$src2, $src1","$src1, $src2", 
+              asm, "$src2, $src1","$src1, $src2",
              (_.VT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
                               IIC_SSE_MOV_S_RR>, EVEX_4V;
   let Constraints = "$src1 = $dst" , mayLoad = 1 in
   defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _,
-                (outs _.RC:$dst), 
+                (outs _.RC:$dst),
                 (ins _.ScalarMemOp:$src),
                 asm,"$src","$src",
-                (_.VT (OpNode (_.VT _.RC:$src1), 
-                      (_.VT (scalar_to_vector 
+                (_.VT (OpNode (_.VT _.RC:$src1),
+                      (_.VT (scalar_to_vector
                              (_.ScalarLdFrag addr:$src)))))>, EVEX;
   let isCodeGenOnly = 1 in {
-    def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), 
+    def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
                       (ins _.RC:$src1, _.FRC:$src2),
                       !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
@@ -2953,7 +2953,7 @@
                   !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(store _.FRC:$src, addr:$dst)], _.ExeDomain,
                   IIC_SSE_MOV_S_MR>, EVEX;
-  def mrk: AVX512PI<0x11, MRMDestMem, (outs), 
+  def mrk: AVX512PI<0x11, MRMDestMem, (outs),
                     (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
                     !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
                     [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K;
@@ -3010,7 +3010,7 @@
                    (SUBREG_TO_REG (i32 0),
                     (VMOVSSZrr (v4i32 (V_SET0)),
                      (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
-  }
+}
 
 let AddedComplexity = 20 in {
 // MOVSSrm zeros the high parts of the register; represent this
@@ -3046,6 +3046,18 @@
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+
+  // Represent the same patterns above but in the form they appear for
+  // 512-bit types
+  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+  def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
 }
 
 def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                  (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
@@ -3429,7 +3441,7 @@
 defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
                                    SSE_INTALU_ITINS_P, HasBWI, 1>;
 
-multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, 
+multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
                             AVX512VLVectorVTInfo _SrcVTInfo,
                             AVX512VLVectorVTInfo _DstVTInfo,
                             SDNode OpNode, Predicate prd, bit IsCommutable = 0> {
   let Predicates = [prd] in
@@ -3439,11 +3451,11 @@
                                 EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
   let Predicates = [HasVLX, prd] in {
     defm NAME#Z256 : avx512_binop_rm2,
+                                EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W;
     defm NAME#Z128 : avx512_binop_rm2,
+                                EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W;
   }
@@ -3452,7 +3464,7 @@
 defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P,
                    avx512vl_i32_info, avx512vl_i64_info, X86pmuldq, HasAVX512, 1>,T8PD;
-defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P, 
+defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P,
                    avx512vl_i32_info, avx512vl_i64_info, X86pmuludq, HasAVX512, 1>;
 defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_P,
@@ -3875,15 +3887,15 @@
                                EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
 }
 
-// Use 512bit version to implement 128/256 bit in case NoVLX. 
+// Use 512bit version to implement 128/256 bit in case NoVLX.
 multiclass avx512_vptest_lowering<SDNode OpNode, X86VectorVTInfo ExtendInfo,
                                   X86VectorVTInfo _, string Suffix> {
   def : Pat<(_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
             (_.KVT (COPY_TO_REGCLASS
                      (!cast<Instruction>(NAME # Suffix # "Zrr")
-                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), 
+                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
                                       _.RC:$src1, _.SubRegIdx),
-                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), 
+                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
                                       _.RC:$src2, _.SubRegIdx)),
                    _.KRC))>;
 }
@@ -3903,7 +3915,7 @@
   let Predicates = [HasAVX512, NoVLX] in {
   defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>;
   defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, Suffix>;
-  } 
+  }
 }
 
 multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> {
@@ -3932,13 +3944,13 @@
   defm BZ128: avx512_vptest, EVEX_V128;
   }
- 
+
   let Predicates = [HasAVX512, NoVLX] in {
   defm BZ256_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v32i8x_info, "B">;
   defm BZ128_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v16i8x_info, "B">;
   defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">;
   defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">;
-  } 
+  }
 }
 
@@ -4136,20 +4148,20 @@
                                  avx512vl_i64_info>, VEX_W;
 }
 
-// Use 512bit version to implement 128/256 bit in case NoVLX. 
+// Use 512bit version to implement 128/256 bit in case NoVLX.
 multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
   let Predicates = [HasBWI, NoVLX] in {
-  def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), 
+  def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
                                   (_.info256.VT _.info256.RC:$src2))),
-            (EXTRACT_SUBREG 
+            (EXTRACT_SUBREG
               (!cast<Instruction>(NAME#"WZrr")
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
              sub_ymm)>;
-  def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), 
+  def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
                                   (_.info128.VT _.info128.RC:$src2))),
-            (EXTRACT_SUBREG 
+            (EXTRACT_SUBREG
              (!cast<Instruction>(NAME#"WZrr")
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
             sub_xmm)>;
@@ -4247,7 +4259,7 @@
                              X86VPermi, avx512vl_f64_info>,
                 EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
 //===----------------------------------------------------------------------===//
-// AVX-512 - VPERMIL 
+// AVX-512 - VPERMIL
 //===----------------------------------------------------------------------===//
 
 multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
@@ -4932,7 +4944,7 @@
 //===----------------------------------------------------------------------===//
 // AVX-512  Scalar convert from float/double to integer
 //===----------------------------------------------------------------------===//
-multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC, 
+multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC,
                                   RegisterClass DstRC, Intrinsic Int,
                                   Operand memop, ComplexPattern mem_cpat, string asm> {
   let hasSideEffects = 0, Predicates = [HasAVX512] in {
@@ -4940,23 +4952,23 @@
               !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG;
     def rb : SI,
+             !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>,
              EVEX, VEX_LIG, EVEX_B, EVEX_RC;
     let mayLoad = 1 in
     def rm : SI, EVEX, VEX_LIG;
-  } // hasSideEffects = 0, Predicates = [HasAVX512] 
+  } // hasSideEffects = 0, Predicates = [HasAVX512]
 }
 
 // Convert float/double to signed/unsigned int 32/64
 defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32,
                                         int_x86_sse_cvtss2si, ssmem,
                                         sse_load_f32, "cvtss2si">,
                                         XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, 
+defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64,
                                         int_x86_sse_cvtss2si64, ssmem,
                                         sse_load_f32, "cvtss2si">,
                                         XS, VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, 
+defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32,
                                         int_x86_avx512_cvtss2usi, ssmem,
                                         sse_load_f32, "cvtss2usi">,
                                         XS, EVEX_CD8<32, CD8VT1>;
@@ -4967,11 +4979,11 @@
 defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32,
                                         int_x86_sse2_cvtsd2si, sdmem,
                                         sse_load_f64, "cvtsd2si">,
                                         XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, 
+defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64,
                                         int_x86_sse2_cvtsd2si64, sdmem,
                                         sse_load_f64, "cvtsd2si">,
                                         XD, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, 
+defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32,
                                         int_x86_avx512_cvtsd2usi, sdmem,
                                         sse_load_f64, "cvtsd2usi">,
                                         XD, EVEX_CD8<64, CD8VT1>;
@@ -5000,8 +5012,8 @@
 } // isCodeGenOnly = 1, Predicates = [HasAVX512]
 
 // Convert float/double to signed/unsigned int 32/64 with truncation
-multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, 
-                            X86VectorVTInfo _DstRC, SDNode OpNode, 
+multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
+                            X86VectorVTInfo _DstRC, SDNode OpNode,
                             SDNode OpNodeRnd>{
 let Predicates = [HasAVX512] in {
   def rr : SI, EVEX, EVEX_B;
   def rm : SI,
+           [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
            EVEX;
 
   let isCodeGenOnly = 1,hasSideEffects = 0 in {
@@ -5022,11 +5034,11 @@
                 (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG;
   def rb_Int : SI,
+               [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src,
+                     (i32 FROUND_NO_EXC)))]>,
                EVEX,VEX_LIG , EVEX_B;
   let mayLoad = 1 in
-  def rm_Int : SI, EVEX, VEX_LIG;
+  def rm_Int : SI, EVEX, VEX_LIG;
@@ -5036,30 +5048,30 @@
 }
 
-defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info, 
-                                   fp_to_sint,X86cvttss2IntRnd>, 
+defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info,
+                                   fp_to_sint,X86cvttss2IntRnd>,
                                    XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info, 
-                                     fp_to_sint,X86cvttss2IntRnd>, 
+defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info,
+                                     fp_to_sint,X86cvttss2IntRnd>,
                                      VEX_W, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info, 
+defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info,
                                    fp_to_sint,X86cvttsd2IntRnd>,
                                    XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info, 
-                                     fp_to_sint,X86cvttsd2IntRnd>, 
+defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info,
+                                     fp_to_sint,X86cvttsd2IntRnd>,
                                      VEX_W, XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info, 
-                                    fp_to_uint,X86cvttss2UIntRnd>, 
+defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info,
+                                    fp_to_uint,X86cvttss2UIntRnd>,
                                     XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info, 
-                                      fp_to_uint,X86cvttss2UIntRnd>, 
+defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info,
+                                      fp_to_uint,X86cvttss2UIntRnd>,
                                       XS,VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info, 
-                                    fp_to_uint,X86cvttsd2UIntRnd>, 
+defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info,
+                                    fp_to_uint,X86cvttsd2UIntRnd>,
                                     XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info, 
-                                      fp_to_uint,X86cvttsd2UIntRnd>, 
+defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info,
+                                      fp_to_uint,X86cvttsd2UIntRnd>,
                                       XD, VEX_W, EVEX_CD8<64, CD8VT1>;
 
 let Predicates = [HasAVX512] in {
   def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
@@ -5078,17 +5090,17 @@
 multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                                 X86VectorVTInfo _Src, SDNode OpNode> {
   defm rr : AVX512_maskable_scalar,
+                        (_Src.VT _Src.RC:$src2)))>,
             EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
   defm rm : AVX512_maskable_scalar,
+            (_.VT (OpNode (_Src.VT _Src.RC:$src1),
+                   (_Src.VT (scalar_to_vector
+                             (_Src.ScalarLdFrag addr:$src2)))))>,
             EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
 }
@@ -5098,7 +5110,7 @@
   defm rrb : AVX512_maskable_scalar,
              EVEX_4V, VEX_LIG, EVEX_B;
@@ -5110,13 +5122,13 @@
   defm rrb : AVX512_maskable_scalar,
              EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_B, EVEX_RC;
 }
 
-multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode, 
-                                      SDNode OpNodeRnd, X86VectorVTInfo _src, 
+multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                      SDNode OpNodeRnd, X86VectorVTInfo _src,
                                       X86VectorVTInfo _dst> {
   let Predicates = [HasAVX512] in {
     defm Z : avx512_cvt_fp_scalar,
@@ -5126,22 +5138,22 @@
   }
 }
 
-multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode, 
-                                      SDNode OpNodeRnd, X86VectorVTInfo _src, 
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                      SDNode OpNodeRnd, X86VectorVTInfo _src,
                                       X86VectorVTInfo _dst> {
   let Predicates = [HasAVX512] in {
     defm Z : avx512_cvt_fp_scalar,
-             avx512_cvt_fp_sae_scalar, 
+             avx512_cvt_fp_sae_scalar,
              EVEX_CD8<32, CD8VT1>, XS, EVEX_V512;
   }
 }
 defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround,
                                             X86froundRnd, f64x_info, f32x_info>;
-defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, 
+defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext,
                                             X86fpextRnd,f32x_info, f64x_info >;
 
-def : Pat<(f64 (fextend FR32X:$src)), 
-          (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), 
+def : Pat<(f64 (fextend FR32X:$src)),
+          (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
                (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
           Requires<[HasAVX512]>;
 def : Pat<(f64 (fextend (loadf32 addr:$src))),
@@ -5153,12 +5165,12 @@
           Requires<[HasAVX512, OptForSize]>;
 
 def : Pat<(f64 (extloadf32 addr:$src)),
-          (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), 
+          (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
                     (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
           Requires<[HasAVX512, OptForSpeed]>;
 
-def : Pat<(f32 (fround FR64X:$src)), 
-          (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), 
+def : Pat<(f32 (fround FR64X:$src)),
+          (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
               (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
           Requires<[HasAVX512]>;
 //===----------------------------------------------------------------------===//
@@ -5575,7 +5587,7 @@
 //===----------------------------------------------------------------------===//
 // Half precision conversion instructions
 //===----------------------------------------------------------------------===//
-multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, 
+multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                            X86MemOperand x86memop, PatFrag ld_frag> {
   defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
                             (ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
                             (X86cvtph2ps (_src.VT _src.RC:$src),
                                          (i32 FROUND_CURRENT))>, T8PD;
@@ -5583,7 +5595,7 @@
   let hasSideEffects = 0, mayLoad = 1 in {
   defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
-                            "vcvtph2ps", "$src", "$src", 
+                            "vcvtph2ps", "$src", "$src",
                             (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
                                          (i32 FROUND_CURRENT))>, T8PD;
   }
@@ -5599,43 +5611,43 @@
 let Predicates = [HasAVX512] in {
   defm VCVTPH2PSZ : avx512_cvtph2ps,
-                    avx512_cvtph2ps_sae, 
+                    avx512_cvtph2ps_sae,
                     EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
   let Predicates = [HasVLX] in {
-    defm VCVTPH2PSZ256 : avx512_cvtph2ps,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
     defm VCVTPH2PSZ128 : avx512_cvtph2ps, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
   }
 }
 
-multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, 
+multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                            X86MemOperand x86memop> {
   defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
                    (ins _src.RC:$src1, i32u8imm:$src2),
-                   "vcvtps2ph", "$src2, $src1", "$src1, $src2", 
+                   "vcvtps2ph", "$src2, $src1", "$src1, $src2",
                    (X86cvtps2ph (_src.VT _src.RC:$src1),
-                                (i32 imm:$src2), 
+                                (i32 imm:$src2),
                                 (i32 FROUND_CURRENT))>, AVX512AIi8Base;
   let hasSideEffects = 0, mayStore = 1 in {
     def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
-              "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", 
+              "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
              [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
                                 (i32 imm:$src2), (i32 FROUND_CURRENT) )),
                                 addr:$dst)]>;
    def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
              (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
-             "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", 
+             "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
              []>, EVEX_K;
   }
 }
 multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
   defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
                    (ins _src.RC:$src1, i32u8imm:$src2),
-                   "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}", 
+                   "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}",
                    (X86cvtps2ph (_src.VT _src.RC:$src1),
-                                (i32 imm:$src2), 
+                                (i32 imm:$src2),
                                 (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base;
 }
 let Predicates = [HasAVX512] in {
@@ -5655,7 +5667,7 @@
                                  string OpcodeStr> {
   def rb: AVX512,
          EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[WriteFAdd]>;
@@ -6660,14 +6672,14 @@
                    [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX;
 }
 
-// Use 512bit version to implement 128/256 bit in case NoVLX. 
-multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo,
-                                           X86VectorVTInfo _> {
+// Use 512bit version to implement 128/256 bit in case NoVLX.
+multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo,
+                                           X86VectorVTInfo _> {
 def : Pat<(_.KVT (X86cvt2mask (_.VT _.RC:$src))),
           (_.KVT (COPY_TO_REGCLASS
                    (!cast<Instruction>(NAME#"Zrr")
-                     (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), 
+                     (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
                                     _.RC:$src, _.SubRegIdx)),
                  _.KRC))>;
 }
@@ -7449,29 +7461,29 @@
   def rm : AVX512;
 }
 
-multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr, 
+multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
                                    Format MRMm, string OpcodeStr, Predicate prd>{
   let Predicates = [prd] in
-  defm Z512 : avx512_shift_packed, EVEX_V512;
   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_shift_packed, EVEX_V256;
-    defm Z128 : avx512_shift_packed, EVEX_V128;
  }
 }
-defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", 
+defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
                                        HasBWI>, AVX512PDIi8Base, EVEX_4V;
-defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", 
+defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
                                        HasBWI>, AVX512PDIi8Base, EVEX_4V;
 
-multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, 
+multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
                                 string OpcodeStr, X86VectorVTInfo _dst,
                                 X86VectorVTInfo _src>{
   def rr : AVX512BI;
 }
 
-multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, 
+multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
                                     string OpcodeStr, Predicate prd> {
   let Predicates = [prd] in
     defm Z512 : avx512_psadbw_packed, EVEX_4V;
 
 multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -7592,7 +7604,7 @@
 let Constraints = "$src1 = $dst" in {
   defm rrib : AVX512_maskable_3src,
 
+defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+                                          f32x_info, v4i32x_info>,
                    AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
-defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, 
-                                          f64x_info, v2i64x_info>, 
+defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+                                          f64x_info, v2i64x_info>,
                    AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
-defm VFIXUPIMMPS : avx512_fixupimm_packed_all, 
+defm VFIXUPIMMPS : avx512_fixupimm_packed_all,
                    EVEX_CD8<32, CD8VF>;
-defm VFIXUPIMMPD : avx512_fixupimm_packed_all, 
+defm VFIXUPIMMPD : avx512_fixupimm_packed_all,
                    EVEX_CD8<64, CD8VF>, VEX_W;
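The three 512-bit X86vzmovl patterns added above mirror the existing
128/256-bit ones, so a zero-extended scalar load into a ZMM register can
select VMOVSSZrm/VMOVSDZrm/VMOVDI2PDIZrm directly. A minimal IR sketch of the
v16f32 case (the function name is an assumption, and this presumes an AVX-512
target, e.g. llc -mattr=+avx512f):

; Illustrative sketch, not part of the patch's tests: a scalar f32 load
; zero-extended into v16f32 should now select a single VMOVSSZrm.
define <16 x float> @vzmovl_v16f32_load(float* %p) {
  %f = load float, float* %p
  %v = insertelement <16 x float> zeroinitializer, float %f, i32 0
  ret <16 x float> %v
}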
Index: test/CodeGen/X86/merge-consecutive-loads-128.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -327,14 +327,12 @@
 define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_8i16_i16_34uuuuuu:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pinsrw $0, 6(%rdi), %xmm0
-; SSE-NEXT:    pinsrw $1, 8(%rdi), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_8i16_i16_34uuuuuu:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpinsrw $0, 6(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $1, 8(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 3
   %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 4
@@ -427,33 +425,14 @@
 }
 
 define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
-; SSE2-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movzbl (%rdi), %eax
-; SSE2-NEXT:    movzbl 1(%rdi), %ecx
-; SSE2-NEXT:    shll $8, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pinsrw $0, %ecx, %xmm0
-; SSE2-NEXT:    movzbl 3(%rdi), %eax
-; SSE2-NEXT:    shll $8, %eax
-; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pinsrb $0, (%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $1, 1(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $3, 3(%rdi), %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $0, (%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $1, 1(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $3, 3(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
   %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
Index: test/CodeGen/X86/merge-consecutive-loads-256.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -417,29 +417,10 @@
 }
 
 define <16 x i16> @merge_16i16_i16_89zzzuuuuuuuuuuuz(i16* %ptr) nounwind uwtable noinline ssp {
-; AVX1-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm1
-; AVX1-NEXT:    vpinsrw $1, 18(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm1
-; AVX2-NEXT:    vpinsrw $1, 18(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm1
-; AVX512F-NEXT:    vpinsrw $1, 18(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    retq
+; AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    retq
   %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 8
   %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 9
   %val0 = load i16, i16* %ptr0
@@ -547,9 +528,7 @@
 define <32 x i8> @merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpinsrb $0, 4(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $1, 5(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $3, 7(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 4
   %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 5
@@ -564,32 +543,10 @@
 }
 
 define <32 x i8> @merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
-; AVX1-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $0, 2(%rdi), %xmm0, %xmm1
-; AVX1-NEXT:    vpinsrb $1, 3(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vpinsrb $3, 5(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $0, 2(%rdi), %xmm0, %xmm1
-; AVX2-NEXT:    vpinsrb $1, 3(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vpinsrb $3, 5(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $0, 2(%rdi), %xmm0, %xmm1
-; AVX512F-NEXT:    vpinsrb $1, 3(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrb $3, 5(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    retq
+; AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    retq
   %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 2
   %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 3
   %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 5
Index: test/CodeGen/X86/merge-consecutive-loads-512.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -420,21 +420,13 @@
 define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
 ; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm1
-; AVX512F-NEXT:    vpinsrw $1, 6(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT:    vxorps %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm1
-; AVX512BW-NEXT:    vpinsrw $1, 6(%rdi), %xmm1, %xmm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512BW-NEXT:    retq
   %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
   %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
@@ -490,23 +482,13 @@
 define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
 ; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $0, 1(%rdi), %xmm0, %xmm1
-; AVX512F-NEXT:    vpinsrb $1, 2(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrb $3, 4(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT:    vxorps %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $0, 1(%rdi), %xmm0, %xmm1
-; AVX512BW-NEXT:    vpinsrb $1, 2(%rdi), %xmm1, %xmm1
-; AVX512BW-NEXT:    vpinsrb $3, 4(%rdi), %xmm1, %xmm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512BW-NEXT:    retq
   %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
   %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2