Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5639,11 +5639,12 @@
       (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
 
   // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
-  // TODO: The code below fires only for for loading the low 64-bits of a
-  // of a 128-bit vector. It's probably worth generalizing more.
   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
-      (VT.is128BitVector() && TLI.isTypeLegal(MVT::v2i64))) {
-    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+      ((VT.is128BitVector() && TLI.isTypeLegal(MVT::v2i64)) ||
+       (VT.is256BitVector() && TLI.isTypeLegal(MVT::v4i64)) ||
+       (VT.is512BitVector() && TLI.isTypeLegal(MVT::v8i64)))) {
+    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+    SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
     SDValue ResNode =
         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
@@ -13659,7 +13660,7 @@
                                 DAG.getConstant(ShiftInx, DL, ExtVT));
     ShiftNode = DAG.getBitcast(InVT, ShiftNode);
     return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
-  } 
+  }
   // Use TESTD/Q, extended vector to packed dword/qword.
   assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
          "Unexpected vector type.");
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -31,9 +31,9 @@
   RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
 
   // The mask VT.
-  ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1", 
-                                        "v" # NumElts # "i1")); 
- 
+  ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1",
+                                        "v" # NumElts # "i1"));
+
   // The GPR register class that can hold the write mask. Use GR8 for fewer
   // than 8 elements. Use shift-right and equal to work around the lack of
   // !lt in tablegen.
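
NOTE: To make the X86ISelLowering.cpp hunk above concrete: the consecutive-load
combine used to form a VZEXT_LOAD node only when the low 64 bits of a 128-bit
vector were loaded; the widened type check lets it fire for 256-bit and 512-bit
result vectors too. A minimal sketch of IR that should now take the new path
(function and value names are hypothetical, not taken from the patch or its
tests):

  define <4 x i64> @vzload_v4i64(i64* %p) {
    %ld = load i64, i64* %p
    %v = insertelement <4 x i64> zeroinitializer, i64 %ld, i32 0
    ret <4 x i64> %v
  }

With the v4i64 X86vzload selection patterns added later in this patch, this
should compile to a single vmovq load, relying on vmovq implicitly zeroing
everything above bit 63, instead of vmovq + vxorps + vinsertf128.
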
@@ -998,11 +998,11 @@
 multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
                                       X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
-  let mayLoad = 1 in 
-  defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
-                           (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
-                           (_Dst.VT (X86SubVBroadcast
-                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, 
+  let mayLoad = 1 in
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+                           (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
+                           (_Dst.VT (X86SubVBroadcast
+                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
                   AVX5128IBase, EVEX;
 }
@@ -1118,7 +1118,7 @@
                   [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX;
 }
 
-multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, 
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
                                  AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> {
   let Predicates = [HasCDI] in
   defm Z : avx512_mask_broadcastm, EVEX_V512;
@@ -1904,7 +1904,7 @@
                       (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
                       OpcodeStr##_.Suffix#
                       "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
-                      [(set _.KRC:$dst,(or _.KRCWM:$mask, 
+                      [(set _.KRC:$dst,(or _.KRCWM:$mask,
                       (OpNode (_.VT _.RC:$src1),
                       (i32 imm:$src2))))], NoItinerary>, EVEX_K;
   let mayLoad = 1, AddedComplexity = 20 in {
@@ -1919,7 +1919,7 @@
                       (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
                       OpcodeStr##_.Suffix##
                       "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
-                      [(set _.KRC:$dst,(or _.KRCWM:$mask, 
+                      [(set _.KRC:$dst,(or _.KRCWM:$mask,
                       (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
                       (i32 imm:$src2))))], NoItinerary>, EVEX_K;
   }
@@ -1940,7 +1940,7 @@
                       (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
                       OpcodeStr##_.Suffix#
                       "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
-                      [(set _.KRC:$dst,(or _.KRCWM:$mask, 
+                      [(set _.KRC:$dst,(or _.KRCWM:$mask,
                       (OpNode (_.VT _.RC:$src1),
                       (i32 imm:$src2))))], NoItinerary>, EVEX_K;
   let mayLoad = 1 in {
@@ -1948,14 +1948,14 @@
                       (ins _.MemOp:$src1, i32u8imm:$src2),
                       OpcodeStr##_.Suffix##mem#
                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                      [(set _.KRC:$dst,(OpNode 
+                      [(set _.KRC:$dst,(OpNode
                       (_.VT (bitconvert (_.LdFrag addr:$src1))),
                       (i32 imm:$src2)))], NoItinerary>;
   def rmk : AVX512, EVEX_K;
   def rmb : AVX512,EVEX_B;
   def rmbk : AVX512, EVEX_B, EVEX_K;
@@ -1981,10 +1981,10 @@
 }
 
-multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _,
-                                     bits<8> opc, SDNode OpNode, Predicate prd, 
+multiclass avx512_vector_fpclass_all<string OpcodeStr,
+                AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd,
                 string broadcast>{
   let Predicates = [prd] in {
-    defm Z : avx512_vector_fpclass, EVEX_V512; 
+    defm Z : avx512_vector_fpclass, EVEX_V512;
   }
   let Predicates = [prd, HasVLX] in {
@@ -1997,9 +1997,9 @@
 multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
                                  bits<8> opcScalar, SDNode VecOpNode,
                                  SDNode ScalarOpNode, Predicate prd>{
-  defm PS : avx512_vector_fpclass_all, 
+  defm PS : avx512_vector_fpclass_all,
             EVEX_CD8<32, CD8VF>;
-  defm PD : avx512_vector_fpclass_all, 
+  defm PD : avx512_vector_fpclass_all,
            EVEX_CD8<64, CD8VF> , VEX_W;
   defm SS : avx512_scalar_fpclass, EVEX_CD8<32, CD8VT1>;
@@ -2113,12 +2113,12 @@
 def : Pat<(store VK4:$src, addr:$dst),
           (MOV8mr addr:$dst,
            (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)),
-            sub_8bit))>; 
+            sub_8bit))>;
 def : Pat<(store VK8:$src, addr:$dst),
           (MOV8mr addr:$dst,
            (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
             sub_8bit))>;
- 
+
 def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
           (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
 def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
@@ -2596,7 +2596,7 @@
-  def rrkz : AVX512PI, 
+  def rrkz : AVX512PI,
@@ -2919,24 +2919,24 @@
 // AVX-512 MOVSS, MOVSD
 //===----------------------------------------------------------------------===//
 
-multiclass avx512_move_scalar<string asm, SDNode OpNode, X86VectorVTInfo _> {
-  defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), 
+multiclass avx512_move_scalar<string asm, SDNode OpNode, X86VectorVTInfo _> {
+  defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst),
                            (ins _.RC:$src1, _.RC:$src2),
-                           asm, "$src2, $src1","$src1, $src2", 
+                           asm, "$src2, $src1","$src1, $src2",
                            (_.VT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
                           IIC_SSE_MOV_S_RR>, EVEX_4V;
   let Constraints = "$src1 = $dst" , mayLoad = 1 in
   defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _,
-                           (outs _.RC:$dst), 
+                           (outs _.RC:$dst),
                            (ins _.ScalarMemOp:$src),
                            asm,"$src","$src",
-                           (_.VT (OpNode (_.VT _.RC:$src1),
-                                 (_.VT (scalar_to_vector 
+                           (_.VT (OpNode (_.VT _.RC:$src1),
+                                 (_.VT (scalar_to_vector
                                    (_.ScalarLdFrag addr:$src)))))>, EVEX;
   let isCodeGenOnly = 1 in {
-    def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), 
+    def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
                       (ins _.RC:$src1, _.FRC:$src2),
                       !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
@@ -2953,7 +2953,7 @@
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
              EVEX;
-  def mrk: AVX512PI<0x11, MRMDestMem, (outs), 
+  def mrk: AVX512PI<0x11, MRMDestMem, (outs),
                     (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
              !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
              [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K;
@@ -3175,6 +3175,12 @@
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                    (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
+
+  // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
+  def : Pat<(v8i64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
 }
 
 def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
@@ -3429,7 +3435,7 @@
 defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
                                    SSE_INTALU_ITINS_P, HasBWI, 1>;
 
-multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, 
+multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
                             AVX512VLVectorVTInfo _SrcVTInfo,
                             AVX512VLVectorVTInfo _DstVTInfo,
                             SDNode OpNode, Predicate prd, bit IsCommutable = 0> {
   let Predicates = [prd] in
@@ -3439,11 +3445,11 @@
                                 EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
   let Predicates = [HasVLX, prd] in {
-    defm NAME#Z256 : avx512_binop_rm2, 
+    defm NAME#Z256 : avx512_binop_rm2,
                                 EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W;
-    defm NAME#Z128 : avx512_binop_rm2, 
+    defm NAME#Z128 : avx512_binop_rm2,
                                 EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W;
   }
 }
@@ -3452,7 +3458,7 @@
 defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P,
                                 avx512vl_i32_info, avx512vl_i64_info,
                                 X86pmuldq, HasAVX512, 1>,T8PD;
-defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P, 
+defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P,
                                 avx512vl_i32_info, avx512vl_i64_info,
                                 X86pmuludq, HasAVX512, 1>;
 defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_P,
@@ -3875,15 +3881,15 @@
                   EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
 }
 
-// Use 512bit version to implement 128/256 bit in case NoVLX. 
+// Use 512bit version to implement 128/256 bit in case NoVLX.
 multiclass avx512_vptest_lowering<SDNode OpNode, X86VectorVTInfo ExtendInfo,
                                   X86VectorVTInfo _, string Suffix> {
   def : Pat<(_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
             (_.KVT (COPY_TO_REGCLASS
                      (!cast<Instruction>(NAME # Suffix # "Zrr")
-                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), 
+                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
                                       _.RC:$src1, _.SubRegIdx),
-                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), 
+                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
                                       _.RC:$src2, _.SubRegIdx)),
                    _.KRC))>;
 }
@@ -3903,7 +3909,7 @@
   let Predicates = [HasAVX512, NoVLX] in {
   defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>;
   defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, Suffix>;
-  } 
+  }
 }
 
 multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> {
@@ -3932,13 +3938,13 @@
   defm BZ128:  avx512_vptest, EVEX_V128;
   }
- 
+
   let Predicates = [HasAVX512, NoVLX] in {
   defm BZ256_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v32i8x_info, "B">;
   defm BZ128_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v16i8x_info, "B">;
   defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">;
   defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">;
-  } 
+  }
 }
@@ -4136,20 +4142,20 @@
                                  avx512vl_i64_info>, VEX_W;
 }
 
-// Use 512bit version to implement 128/256 bit in case NoVLX. 
+// Use 512bit version to implement 128/256 bit in case NoVLX.
 multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
   let Predicates = [HasBWI, NoVLX] in {
-  def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), 
+  def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
                                   (_.info256.VT _.info256.RC:$src2))),
-            (EXTRACT_SUBREG 
+            (EXTRACT_SUBREG
               (!cast<Instruction>(NAME#"WZrr")
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
              sub_ymm)>;
-  def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), 
+  def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
                                   (_.info128.VT _.info128.RC:$src2))),
-            (EXTRACT_SUBREG 
+            (EXTRACT_SUBREG
               (!cast<Instruction>(NAME#"WZrr")
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
@@ -4247,7 +4253,7 @@
                              X86VPermi, avx512vl_f64_info>,
                   EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
 //===----------------------------------------------------------------------===//
-// AVX-512 - VPERMIL 
+// AVX-512 - VPERMIL
 //===----------------------------------------------------------------------===//
 
 multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
@@ -4932,7 +4938,7 @@
 //===----------------------------------------------------------------------===//
 // AVX-512  Scalar convert from float/double to integer
 //===----------------------------------------------------------------------===//
-multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC, 
+multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC,
                                   RegisterClass DstRC, Intrinsic Int, Operand memop,
                                   ComplexPattern mem_cpat, string asm> {
   let hasSideEffects = 0, Predicates = [HasAVX512] in {
@@ -4940,23 +4946,23 @@
             !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
             [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG;
   def rb : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc),
-            !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>, 
+            !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>,
             EVEX, VEX_LIG, EVEX_B, EVEX_RC;
   let mayLoad = 1 in
   def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
             !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG;
-  } // hasSideEffects = 0, Predicates = [HasAVX512] 
+  } // hasSideEffects = 0, Predicates = [HasAVX512]
 }
 
 // Convert float/double to signed/unsigned int 32/64
 defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32,
                                   int_x86_sse_cvtss2si, ssmem, sse_load_f32, "cvtss2si">,
                                   XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, 
+defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64,
                                   int_x86_sse_cvtss2si64, ssmem, sse_load_f32, "cvtss2si">,
                                   XS, VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, 
+defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32,
                                   int_x86_avx512_cvtss2usi, ssmem, sse_load_f32, "cvtss2usi">,
                                   XS, EVEX_CD8<32, CD8VT1>;
@@ -4967,11 +4973,11 @@
 defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32,
                                   int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si">,
                                   XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, 
+defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64,
                                   int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si">,
                                   XD, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, 
+defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32,
                                   int_x86_avx512_cvtsd2usi, sdmem, sse_load_f64, "cvtsd2usi">,
                                   XD, EVEX_CD8<64, CD8VT1>;
@@ -5000,8 +5006,8 @@
 } // isCodeGenOnly = 1, Predicates = [HasAVX512]
 
 // Convert float/double to signed/unsigned int 32/64 with truncation
-multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, 
-                            X86VectorVTInfo _DstRC, SDNode OpNode, 
+multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
+                            X86VectorVTInfo _DstRC, SDNode OpNode,
                             SDNode OpNodeRnd>{
 let Predicates = [HasAVX512] in {
   def rr : SI, EVEX, EVEX_B;
   def rm : SI,
-           [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, 
+           [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
            EVEX;
 
   let isCodeGenOnly = 1,hasSideEffects = 0 in {
@@ -5022,11 +5028,11 @@
                                 (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG;
   def rb_Int : SI,
-               [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, 
-                                     (i32 FROUND_NO_EXC)))]>, 
+               [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src,
+                                     (i32 FROUND_NO_EXC)))]>,
                EVEX,VEX_LIG , EVEX_B;
   let mayLoad = 1 in
-  def rm_Int : SI, 
+  def rm_Int : SI,
                EVEX, VEX_LIG;
@@ -5036,30 +5042,30 @@
 }
 
-defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info, 
-                        fp_to_sint,X86cvttss2IntRnd>, 
+defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info,
+                        fp_to_sint,X86cvttss2IntRnd>,
                         XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info, 
-                        fp_to_sint,X86cvttss2IntRnd>, 
+defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info,
+                        fp_to_sint,X86cvttss2IntRnd>,
                         VEX_W, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info, 
+defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info,
                         fp_to_sint,X86cvttsd2IntRnd>,
                         XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info, 
-                        fp_to_sint,X86cvttsd2IntRnd>, 
+defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info,
+                        fp_to_sint,X86cvttsd2IntRnd>,
                         VEX_W, XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info, 
-                        fp_to_uint,X86cvttss2UIntRnd>, 
+defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info,
+                        fp_to_uint,X86cvttss2UIntRnd>,
                         XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info, 
-                        fp_to_uint,X86cvttss2UIntRnd>, 
+defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info,
+                        fp_to_uint,X86cvttss2UIntRnd>,
                         XS,VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info, 
-                        fp_to_uint,X86cvttsd2UIntRnd>, 
+defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info,
+                        fp_to_uint,X86cvttsd2UIntRnd>,
                         XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info, 
-                        fp_to_uint,X86cvttsd2UIntRnd>, 
+defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info,
+                        fp_to_uint,X86cvttsd2UIntRnd>,
                         XD, VEX_W, EVEX_CD8<64, CD8VT1>;
 let Predicates = [HasAVX512] in {
   def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
@@ -5078,17 +5084,17 @@
 multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                                 X86VectorVTInfo _Src, SDNode OpNode> {
   defm rr : AVX512_maskable_scalar,
-                         (_.VT (OpNode (_Src.VT _Src.RC:$src1),
-                                       (_Src.VT _Src.RC:$src2)))>, 
+                         (_.VT (OpNode (_Src.VT _Src.RC:$src1),
+                                       (_Src.VT _Src.RC:$src2)))>,
                          EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
   defm rm : AVX512_maskable_scalar,
-                         (_.VT (OpNode (_Src.VT _Src.RC:$src1),
-                                       (_Src.VT (scalar_to_vector
-                                                 (_Src.ScalarLdFrag addr:$src2)))))>, 
+                         (_.VT (OpNode (_Src.VT _Src.RC:$src1),
+                                       (_Src.VT (scalar_to_vector
+                                                 (_Src.ScalarLdFrag addr:$src2)))))>,
                          EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
 }
@@ -5098,7 +5104,7 @@
-  defm rrb : AVX512_maskable_scalar, 
+  defm rrb : AVX512_maskable_scalar,
                          EVEX_4V, VEX_LIG, EVEX_B;
@@ -5110,13 +5116,13 @@
-  defm rrb : AVX512_maskable_scalar, 
+  defm rrb : AVX512_maskable_scalar,
                          EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
                          EVEX_B, EVEX_RC;
 }
 
-multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode, 
-                                      SDNode OpNodeRnd, X86VectorVTInfo _src, 
+multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                      SDNode OpNodeRnd, X86VectorVTInfo _src,
                                       X86VectorVTInfo _dst> {
   let Predicates = [HasAVX512] in {
     defm Z : avx512_cvt_fp_scalar,
@@ -5126,22 +5132,22 @@
   }
 }
 
-multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode, 
-                                      SDNode OpNodeRnd, X86VectorVTInfo _src, 
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                      SDNode OpNodeRnd, X86VectorVTInfo _src,
                                       X86VectorVTInfo _dst> {
   let Predicates = [HasAVX512] in {
     defm Z : avx512_cvt_fp_scalar,
-             avx512_cvt_fp_sae_scalar, 
+             avx512_cvt_fp_sae_scalar,
              EVEX_CD8<32, CD8VT1>, XS, EVEX_V512;
   }
 }
 defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround,
                                             X86froundRnd, f64x_info, f32x_info>;
-defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, 
+defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext,
                                             X86fpextRnd,f32x_info, f64x_info >;
 
-def : Pat<(f64 (fextend FR32X:$src)), 
-          (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), 
+def : Pat<(f64 (fextend FR32X:$src)),
+          (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
                (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
           Requires<[HasAVX512]>;
 def : Pat<(f64 (fextend (loadf32 addr:$src))),
@@ -5153,12 +5159,12 @@
           Requires<[HasAVX512, OptForSize]>;
 
 def : Pat<(f64 (extloadf32 addr:$src)),
-          (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), 
+          (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
               (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
           Requires<[HasAVX512, OptForSpeed]>;
 
-def : Pat<(f32 (fround FR64X:$src)), 
-          (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), 
+def : Pat<(f32 (fround FR64X:$src)),
+          (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
               (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
           Requires<[HasAVX512]>;
 //===----------------------------------------------------------------------===//
@@ -5575,7 +5581,7 @@
 //===----------------------------------------------------------------------===//
 // Half precision conversion instructions
 //===----------------------------------------------------------------------===//
-multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, 
+multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                            X86MemOperand x86memop, PatFrag ld_frag> {
   defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
                     "vcvtph2ps", "$src", "$src",
@@ -5583,7 +5589,7 @@
                     (X86cvtph2ps (_src.VT _src.RC:$src),
                                  (i32 FROUND_CURRENT))>, T8PD;
   let hasSideEffects = 0, mayLoad = 1 in {
     defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
-                    "vcvtph2ps", "$src", "$src", 
+                    "vcvtph2ps", "$src", "$src",
                     (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
                                  (i32 FROUND_CURRENT))>, T8PD;
   }
 }
@@ -5599,43 +5605,43 @@
 let Predicates = [HasAVX512] in {
   defm VCVTPH2PSZ : avx512_cvtph2ps,
-                    avx512_cvtph2ps_sae, 
+                    avx512_cvtph2ps_sae,
                     EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
   let Predicates = [HasVLX] in {
-    defm VCVTPH2PSZ256 : avx512_cvtph2ps,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; 
+    defm VCVTPH2PSZ256 : avx512_cvtph2ps,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
     defm VCVTPH2PSZ128 : avx512_cvtph2ps, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
   }
 }
 
-multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, 
+multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                            X86MemOperand x86memop> {
   defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
                    (ins _src.RC:$src1, i32u8imm:$src2),
-                   "vcvtps2ph", "$src2, $src1", "$src1, $src2", 
+                   "vcvtps2ph", "$src2, $src1", "$src1, $src2",
                    (X86cvtps2ph (_src.VT _src.RC:$src1),
-                                (i32 imm:$src2), 
+                                (i32 imm:$src2),
                                 (i32 FROUND_CURRENT))>, AVX512AIi8Base;
   let hasSideEffects = 0, mayStore = 1 in {
     def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
                (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
-               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", 
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
                                      (i32 imm:$src2), (i32 FROUND_CURRENT) )),
                                      addr:$dst)]>;
     def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
-              "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", 
+              "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
               []>, EVEX_K;
   }
 }
 multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
   defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
                    (ins _src.RC:$src1, i32u8imm:$src2),
-                   "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}", 
+                   "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}",
                    (X86cvtps2ph (_src.VT _src.RC:$src1),
-                                (i32 imm:$src2), 
+                                (i32 imm:$src2),
                                 (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base;
 }
 let Predicates = [HasAVX512] in {
@@ -5655,7 +5661,7 @@
                             string OpcodeStr> {
-  def rb: AVX512, 
+  def rb: AVX512,
           EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[WriteFAdd]>;
@@ -6660,14 +6666,14 @@
                   [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX;
 }
 
-// Use 512bit version to implement 128/256 bit in case NoVLX. 
-multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo, 
+// Use 512bit version to implement 128/256 bit in case NoVLX.
+multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo,
                                            X86VectorVTInfo _> {
   def : Pat<(_.KVT (X86cvt2mask (_.VT _.RC:$src))),
             (_.KVT (COPY_TO_REGCLASS
                      (!cast<Instruction>(NAME#"Zrr")
-                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), 
+                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
                                       _.RC:$src, _.SubRegIdx)),
                    _.KRC))>;
 }
@@ -7449,29 +7455,29 @@
   def rm : AVX512;
 }
 
-multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr, 
+multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
                                    Format MRMm, string OpcodeStr, Predicate prd>{
   let Predicates = [prd] in
-    defm Z512 : avx512_shift_packed, 
+    defm Z512 : avx512_shift_packed,
                 EVEX_V512;
   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_shift_packed, 
+    defm Z256 : avx512_shift_packed,
                 EVEX_V256;
-    defm Z128 : avx512_shift_packed, 
+    defm Z128 : avx512_shift_packed,
                 EVEX_V128;
   }
 }
-defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", 
+defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
                                        HasBWI>, AVX512PDIi8Base, EVEX_4V;
-defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", 
+defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
                                        HasBWI>, AVX512PDIi8Base, EVEX_4V;
 
-multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, 
+multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
                                 string OpcodeStr, X86VectorVTInfo _dst,
                                 X86VectorVTInfo _src>{
   def rr : AVX512BI;
 }
 
-multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, 
+multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
                                     string OpcodeStr, Predicate prd> {
   let Predicates = [prd] in
     defm Z512 : avx512_psadbw_packed, EVEX_4V;
 
 multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -7592,7 +7598,7 @@
 let Constraints = "$src1 = $dst" in {
-  defm rrib : AVX512_maskable_3src, 
+  defm rrib : AVX512_maskable_3src,
 
-defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, 
-                                          f32x_info, v4i32x_info>, 
+defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+                                          f32x_info, v4i32x_info>,
                    AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
-defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, 
-                                          f64x_info, v2i64x_info>, 
+defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+                                          f64x_info, v2i64x_info>,
                    AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
-defm VFIXUPIMMPS : avx512_fixupimm_packed_all, 
+defm VFIXUPIMMPS : avx512_fixupimm_packed_all,
                    EVEX_CD8<32, CD8VF>;
-defm VFIXUPIMMPD : avx512_fixupimm_packed_all, 
+defm VFIXUPIMMPD : avx512_fixupimm_packed_all,
                    EVEX_CD8<64, CD8VF>, VEX_W;
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -685,11 +685,6 @@
   return cast<LoadSDNode>(N)->getAlignment() >= 16;
 }]>;
 
-// Like 'X86vzload', but always requires 128-bit vector alignment.
-def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{
-  return cast<MemSDNode>(N)->getAlignment() >= 16;
-}]>;
-
 // Like 'load', but always requires 256-bit vector alignment.
 def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 32;
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -5058,6 +5058,8 @@
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
               (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
 }
 
 let Predicates = [UseSSE2], AddedComplexity = 20 in {
@@ -5066,13 +5068,6 @@
   def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
 }
 
-let Predicates = [HasAVX] in {
-def : Pat<(v4i64 (alignedX86vzload addr:$src)),
-          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
-def : Pat<(v4i64 (X86vzload addr:$src)),
-          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
-}
-
 //===---------------------------------------------------------------------===//
 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
 // IA32 document. movq xmm1, xmm2 does clear the high bits.
Index: test/CodeGen/X86/merge-consecutive-loads-256.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -249,26 +249,10 @@
 }
 
 define <8 x float> @merge_8f32_f32_12zzuuzz(float* %ptr) nounwind uwtable noinline ssp {
-; AVX1-LABEL: merge_8f32_f32_12zzuuzz:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: merge_8f32_f32_12zzuuzz:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: merge_8f32_f32_12zzuuzz:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
+; AVX-LABEL: merge_8f32_f32_12zzuuzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    retq
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 1
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 2
   %val0 = load float, float* %ptr0
Index: test/CodeGen/X86/merge-consecutive-loads-512.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -187,10 +187,6 @@
 ; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 8
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 9
@@ -282,10 +278,6 @@
 ; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
 ; ALL:       # BB#0:
 ; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
   %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 2
@@ -383,8 +375,6 @@
 ; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
   %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1
   %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2
@@ -454,18 +444,12 @@
 ; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vxorps %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
   %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
   %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
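
NOTE: The test updates above show the payoff: the zeroed upper half or upper
three quarters of the wide result no longer needs explicit vxor + vinsert
instructions, because vmovq already zeroes everything above bits 63:0. A
minimal 512-bit sketch (hypothetical function name; the expected codegen is an
assumption based on the new v8i64 X86vzload pattern, not a test from the
patch):

  define <8 x i64> @vzload_v8i64(i64* %p) {
    %ld = load i64, i64* %p
    %v = insertelement <8 x i64> zeroinitializer, i64 %ld, i32 0
    ret <8 x i64> %v
  }

Expected AVX-512 codegen after this patch: a single vmovq load
(VMOVZPQILo2PQIZrm) followed by retq, with the upper 448 bits of zmm0
implicitly zero.
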