Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -6094,552 +6094,223 @@ // SSE4.1 - Packed Move with Sign/Zero Extend //===----------------------------------------------------------------------===// -multiclass SS41I_binop_rm_int8 opc, string OpcodeStr, Intrinsic IntId, - OpndItins itins = DEFAULT_ITINS> { - def rr : SS48I opc, string OpcodeStr, X86MemOperand MemOp, + RegisterClass OutRC, RegisterClass InRC, + OpndItins itins> { + def rr : SS48I, + [], itins.rr>, Sched<[itins.Sched]>; - def rm : SS48I, Sched<[itins.Sched.Folded]>; -} - -multiclass SS41I_binop_rm_int16_y opc, string OpcodeStr, - Intrinsic IntId, X86FoldableSchedWrite Sched> { - def Yrr : SS48I, Sched<[Sched]>; - - def Yrm : SS48I, - Sched<[Sched.Folded]>; -} - -let Predicates = [HasAVX] in { -defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", - int_x86_sse41_pmovsxbw, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", - int_x86_sse41_pmovsxwd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", - int_x86_sse41_pmovsxdq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", - int_x86_sse41_pmovzxbw, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", - int_x86_sse41_pmovzxwd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", - int_x86_sse41_pmovzxdq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -} - -let Predicates = [HasAVX2] in { -defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw", - int_x86_avx2_pmovsxbw, - WriteShuffle>, VEX, VEX_L; -defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd", - int_x86_avx2_pmovsxwd, - WriteShuffle>, VEX, VEX_L; -defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq", - int_x86_avx2_pmovsxdq, - WriteShuffle>, VEX, VEX_L; -defm 
VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw", - int_x86_avx2_pmovzxbw, - WriteShuffle>, VEX, VEX_L; -defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd", - int_x86_avx2_pmovzxwd, - WriteShuffle>, VEX, VEX_L; -defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq", - int_x86_avx2_pmovzxdq, - WriteShuffle>, VEX, VEX_L; -} - -defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq, - SSE_INTALU_ITINS_SHUFF_P>; - -let Predicates = [HasAVX] in { - // Common patterns involving scalar load. 
- def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), - (VPMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), - (VPMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))), - (VPMOVSXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))), - (VPMOVSXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), - (VPMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), - (VPMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))), - (VPMOVSXDQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))), - (VPMOVZXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))), - (VPMOVZXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))), - (VPMOVZXDQrm addr:$src)>; -} - -let Predicates = [UseSSE41] in { - // Common patterns involving scalar load. 
- def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), - (PMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), - (PMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))), - (PMOVSXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), - (PMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), - (PMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))), - (PMOVSXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), - (PMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), - (PMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))), - (PMOVSXDQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), - (PMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), - (PMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))), - (PMOVZXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), - (PMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), - (PMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))), - (PMOVZXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), - (PMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), - (PMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))), - (PMOVZXDQrm addr:$src)>; -} - -multiclass SS41I_binop_rm_int4 opc, string OpcodeStr, Intrinsic IntId, - OpndItins itins = DEFAULT_ITINS> { - def rr : SS48I, - Sched<[itins.Sched]>; - - def rm : SS48I, Sched<[itins.Sched.Folded]>; -} - -multiclass SS41I_binop_rm_int8_y opc, string OpcodeStr, - Intrinsic IntId, X86FoldableSchedWrite Sched> { - 
def Yrr : SS48I, Sched<[Sched]>; - - def Yrm : SS48I, - Sched<[Sched.Folded]>; -} - -let Predicates = [HasAVX] in { -defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -} - -let Predicates = [HasAVX2] in { -defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd", - int_x86_avx2_pmovsxbd, WriteShuffle>, - VEX, VEX_L; -defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq", - int_x86_avx2_pmovsxwq, WriteShuffle>, - VEX, VEX_L; -defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd", - int_x86_avx2_pmovzxbd, WriteShuffle>, - VEX, VEX_L; -defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq", - int_x86_avx2_pmovzxwq, WriteShuffle>, - VEX, VEX_L; -} - -defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq, - SSE_INTALU_ITINS_SHUFF_P>; - -let Predicates = [HasAVX] in { - // Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), - (VPMOVSXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), - (VPMOVSXWQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), - (VPMOVZXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), - (VPMOVZXWQrm addr:$src)>; -} - -let Predicates = [UseSSE41] in { - // 
Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), - (PMOVSXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), - (PMOVSXWQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), - (PMOVZXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), - (PMOVZXWQrm addr:$src)>; -} - -multiclass SS41I_binop_rm_int2 opc, string OpcodeStr, Intrinsic IntId, - X86FoldableSchedWrite Sched> { - def rr : SS48I, Sched<[Sched]>; - - // Expecting a i16 load any extended to i32 value. - def rm : SS48I, - Sched<[Sched.Folded]>; -} - -multiclass SS41I_binop_rm_int4_y opc, string OpcodeStr, - Intrinsic IntId, X86FoldableSchedWrite Sched> { - def Yrr : SS48I, Sched<[Sched]>; - - // Expecting a i16 load any extended to i32 value. - def Yrm : SS48I, - Sched<[Sched.Folded]>; -} - -let Predicates = [HasAVX] in { -defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq, - WriteShuffle>, VEX; -defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq, - WriteShuffle>, VEX; -} -let Predicates = [HasAVX2] in { -defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", int_x86_avx2_pmovsxbq, - WriteShuffle>, VEX, VEX_L; -defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", int_x86_avx2_pmovzxbq, - WriteShuffle>, VEX, VEX_L; -} -defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq, - WriteShuffle>; -defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq, - WriteShuffle>; - -let Predicates = [HasAVX2] in { - def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>; - def : Pat<(v8i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>; - - def : Pat<(v8i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vsext (v8i16 VR128:$src))), 
(VPMOVSXWQYrr VR128:$src)>; - - def : Pat<(v4i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>; - - def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))), - (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))), - (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))), - (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))), - (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))), - (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))), - (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v8i32 (X86vsext (v8i16 (bitconvert (v2i64 (load addr:$src)))))), - (VPMOVSXWDYrm addr:$src)>; - def : Pat<(v4i64 (X86vsext (v4i32 (bitconvert (v2i64 (load addr:$src)))))), - (VPMOVSXDQYrm addr:$src)>; - - def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXBDYrm addr:$src)>; - def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXBDYrm addr:$src)>; - - def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXWQYrm addr:$src)>; - def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXWQYrm addr:$src)>; - - def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXBQYrm addr:$src)>; -} - -let Predicates = [HasAVX] in { - // Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXBQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - 
(VPMOVZXBQrm addr:$src)>; -} - -let Predicates = [UseSSE41] in { - def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; - - // Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVSXBQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVZXBQrm addr:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVSXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVSXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVSXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVSXWQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (extloadi32i16 addr:$src))))))), - (PMOVSXBQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVSXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVSXDQrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVSXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64 - (scalar_to_vector (loadf64 
addr:$src))))))), - (PMOVSXBWrm addr:$src)>; + [], + itins.rm>, Sched<[itins.Sched.Folded]>; +} + +multiclass SS41I_pmovx_rm_all opc, string OpcodeStr, + X86MemOperand MemOp, X86MemOperand MemYOp, + OpndItins SSEItins, OpndItins AVXItins, + OpndItins AVX2Itins> { + defm NAME : SS41I_pmovx_rrrm; + let Predicates = [HasAVX] in + defm V#NAME : SS41I_pmovx_rrrm, VEX; + let Predicates = [HasAVX2] in + defm V#NAME#Y : SS41I_pmovx_rrrm, VEX, VEX_L; +} + +multiclass SS41I_pmovx_rm opc, string OpcodeStr, + X86MemOperand MemOp, X86MemOperand MemYOp> { + defm PMOVSX#NAME : SS41I_pmovx_rm_all; + defm PMOVZX#NAME : SS41I_pmovx_rm_all; +} + +defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>; +defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>; +defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>; + +defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>; +defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>; + +defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>; + +// AVX2 Patterns +multiclass SS41I_pmovx_avx2_patterns { + // Register-Register patterns + def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), + (!cast(OpcPrefix#BWYrr) VR128:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), + (!cast(OpcPrefix#BDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), + (!cast(OpcPrefix#BQYrr) VR128:$src)>; + + def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), + (!cast(OpcPrefix#WDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), + (!cast(OpcPrefix#WQYrr) VR128:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), + (!cast(OpcPrefix#DQYrr) VR128:$src)>; + + // AVX2 Register-Memory patterns + def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 
addr:$src)))), + (!cast(OpcPrefix#BWYrm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#BDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#BQYrm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#WDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#WQYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 
(vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#DQYrm) addr:$src)>; } let Predicates = [HasAVX2] in { - def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>; - def : Pat<(v8i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>; - - def : Pat<(v8i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>; - - def : Pat<(v4i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>; - - def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))), - (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))), - (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))), - (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))), - (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))), - (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))), - (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", X86vsext>; + defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", X86vzext>; +} + +// SSE4.1/AVX patterns. 
+multiclass SS41I_pmovx_patterns { + def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), + (!cast(OpcPrefix#BWrr) VR128:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), + (!cast(OpcPrefix#BDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), + (!cast(OpcPrefix#BQrr) VR128:$src)>; + + def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), + (!cast(OpcPrefix#WDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), + (!cast(OpcPrefix#WQrr) VR128:$src)>; + + def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), + (!cast(OpcPrefix#DQrr) VR128:$src)>; + + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#BWrm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#BDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), + (!cast(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#BQrm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp 
(bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#WDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), + (!cast(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#WQrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast(OpcPrefix#DQrm) addr:$src)>; } let Predicates = [HasAVX] in { - def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr 
VR128:$src)>; - - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVZXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))), - (VPMOVZXBQrm addr:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVZXWQrm addr:$src)>; - - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), - (VPMOVZXDQrm addr:$src)>; - - def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - 
(VPMOVSXDQrm addr:$src)>; - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXDQrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXBWrm addr:$src)>; - - def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXWQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (extloadi32i16 addr:$src))))))), - (VPMOVSXBQrm addr:$src)>; + defm : SS41I_pmovx_patterns<"VPMOVSX", X86vsext, extloadi32i16>; + defm : SS41I_pmovx_patterns<"VPMOVZX", X86vzext, loadi16_anyext>; } let Predicates = [UseSSE41] in { - def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>; - - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVZXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVZXBWrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVZXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 
(bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))), - (PMOVZXBQrm addr:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVZXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVZXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVZXWQrm addr:$src)>; - - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), - (PMOVZXDQrm addr:$src)>; + defm : SS41I_pmovx_patterns<"PMOVSX", X86vsext, extloadi32i16>; + defm : SS41I_pmovx_patterns<"PMOVZX", X86vzext, loadi16_anyext>; } //===----------------------------------------------------------------------===// Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -138,6 +138,18 @@ X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, X86ISD::UMIN, 0), X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbq, 
INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), @@ -274,6 +286,18 @@ X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, X86ISD::SMIN, 0), X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, X86ISD::UMIN, 0), X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT), Index: test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll @@ -0,0 +1,110 @@ +; RUN: llc < %s 
-mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 | FileCheck %s + +define <16 x i16> @test_llvm_x86_avx2_pmovsxbw(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbw +; CHECK: vpmovsxbw (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %1) + ret <16 x i16> %2 +} + +define <8 x i32> @test_llvm_x86_avx2_pmovsxbd(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbd +; CHECK: vpmovsxbd (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %1) + ret <8 x i32> %2 +} + +define <4 x i64> @test_llvm_x86_avx2_pmovsxbq(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbq +; CHECK: vpmovsxbq (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %1) + ret <4 x i64> %2 +} + +define <8 x i32> @test_llvm_x86_avx2_pmovsxwd(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwd +; CHECK: vpmovsxwd (%rdi), %ymm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %1) + ret <8 x i32> %2 +} + +define <4 x i64> @test_llvm_x86_avx2_pmovsxwq(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwq +; CHECK: vpmovsxwq (%rdi), %ymm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %1) + ret <4 x i64> %2 +} + +define <4 x i64> @test_llvm_x86_avx2_pmovsxdq(<4 x i32>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovsxdq +; CHECK: vpmovsxdq (%rdi), %ymm0 + %1 = load <4 x i32>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %1) + ret <4 x i64> %2 +} + +define <16 x i16> @test_llvm_x86_avx2_pmovzxbw(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbw +; CHECK: vpmovzxbw (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %1) + ret <16 x i16> %2 +} + +define <8 x i32> @test_llvm_x86_avx2_pmovzxbd(<16 x i8>* %a) { +; CHECK-LABEL:
test_llvm_x86_avx2_pmovzxbd +; CHECK: vpmovzxbd (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %1) + ret <8 x i32> %2 +} + +define <4 x i64> @test_llvm_x86_avx2_pmovzxbq(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbq +; CHECK: vpmovzxbq (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %1) + ret <4 x i64> %2 +} + +define <8 x i32> @test_llvm_x86_avx2_pmovzxwd(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwd +; CHECK: vpmovzxwd (%rdi), %ymm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %1) + ret <8 x i32> %2 +} + +define <4 x i64> @test_llvm_x86_avx2_pmovzxwq(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwq +; CHECK: vpmovzxwq (%rdi), %ymm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %1) + ret <4 x i64> %2 +} + +define <4 x i64> @test_llvm_x86_avx2_pmovzxdq(<4 x i32>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovzxdq +; CHECK: vpmovzxdq (%rdi), %ymm0 + %1 = load <4 x i32>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %1) + ret <4 x i64> %2 +} + +declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) +declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) +declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) +declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) +declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) +declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) +declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) +declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) +declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) +declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) +declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) +declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) Index: test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll =================================================================== --- 
/dev/null +++ test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll @@ -0,0 +1,123 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX + +define <8 x i16> @test_llvm_x86_sse41_pmovsxbw(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxbw +; SSE41: pmovsxbw (%rdi), %xmm0 +; AVX: vpmovsxbw (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %1) + ret <8 x i16> %2 +} + +define <4 x i32> @test_llvm_x86_sse41_pmovsxbd(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxbd +; SSE41: pmovsxbd (%rdi), %xmm0 +; AVX: vpmovsxbd (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %1) + ret <4 x i32> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovsxbq(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxbq +; SSE41: pmovsxbq (%rdi), %xmm0 +; AVX: vpmovsxbq (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %1) + ret <2 x i64> %2 +} + +define <4 x i32> @test_llvm_x86_sse41_pmovsxwd(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxwd +; SSE41: pmovsxwd (%rdi), %xmm0 +; AVX: vpmovsxwd (%rdi), %xmm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) + ret <4 x i32> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovsxwq(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxwq +; SSE41: pmovsxwq (%rdi), %xmm0 +; AVX: vpmovsxwq (%rdi), %xmm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %1) + ret <2 x i64> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovsxdq(<4 x i32>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxdq +; SSE41: pmovsxdq (%rdi), %xmm0 +; AVX: vpmovsxdq (%rdi), %xmm0 + %1 = load 
<4 x i32>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %1) + ret <2 x i64> %2 +} + +define <8 x i16> @test_llvm_x86_sse41_pmovzxbw(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxbw +; SSE41: pmovzxbw (%rdi), %xmm0 +; AVX: vpmovzxbw (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %1) + ret <8 x i16> %2 +} + +define <4 x i32> @test_llvm_x86_sse41_pmovzxbd(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxbd +; SSE41: pmovzxbd (%rdi), %xmm0 +; AVX: vpmovzxbd (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %1) + ret <4 x i32> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovzxbq(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxbq +; SSE41: pmovzxbq (%rdi), %xmm0 +; AVX: vpmovzxbq (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %1) + ret <2 x i64> %2 +} + +define <4 x i32> @test_llvm_x86_sse41_pmovzxwd(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxwd +; SSE41: pmovzxwd (%rdi), %xmm0 +; AVX: vpmovzxwd (%rdi), %xmm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %1) + ret <4 x i32> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovzxwq(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxwq +; SSE41: pmovzxwq (%rdi), %xmm0 +; AVX: vpmovzxwq (%rdi), %xmm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %1) + ret <2 x i64> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovzxdq(<4 x i32>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxdq +; SSE41: pmovzxdq (%rdi), %xmm0 +; AVX: vpmovzxdq (%rdi), %xmm0 + %1 = load <4 x i32>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %1) + ret <2 x i64> %2 +} + +declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) +declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) +declare <4 
x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) +declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) +declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) +declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) +declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) +declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) +declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) +declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) +declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) +declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) Index: test/CodeGen/X86/vector-sext.ll =================================================================== --- test/CodeGen/X86/vector-sext.ll +++ test/CodeGen/X86/vector-sext.ll @@ -567,8 +567,7 @@ ; ; AVX2-LABEL: sext_16i8_to_16i16: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0 ; AVX2-NEXT: retq ; ; X32-SSE41-LABEL: sext_16i8_to_16i16: Index: test/CodeGen/X86/vector-zext.ll =================================================================== --- test/CodeGen/X86/vector-zext.ll +++ test/CodeGen/X86/vector-zext.ll @@ -204,3 +204,157 @@ %t = zext <16 x i8> %z to <16 x i16> ret <16 x i16> %t } + +define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) { +; SSE2-LABEL: load_zext_16i8_to_16i16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: punpckhbw %xmm1, %xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: retq + +; SSSE3-LABEL: load_zext_16i8_to_16i16: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklbw %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; 
SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: punpckhbw %xmm1, %xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: retq + +; SSE41-LABEL: load_zext_16i8_to_16i16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbw %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: punpckhbw %xmm1, %xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: retq + +; AVX1-LABEL: load_zext_16i8_to_16i16: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq + +; AVX2-LABEL: load_zext_16i8_to_16i16: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxbw (%rdi), %ymm0 +; AVX2-NEXT: retq +entry: + %X = load <16 x i8>* %ptr + %Y = zext <16 x i8> %X to <16 x i16> + ret <16 x i16> %Y +} + +define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) { +; SSE2-LABEL: load_zext_8i16_to_8i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: punpckhwd %xmm1, %xmm1 # xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: retq + +; SSSE3-LABEL: load_zext_8i16_to_8i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd %xmm0, %xmm0 # xmm0 = 
xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: punpckhwd %xmm1, %xmm1 # xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: retq + +; SSE41-LABEL: load_zext_8i16_to_8i32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxwd %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: punpckhwd %xmm1, %xmm1 # xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: retq + +; AVX1-LABEL: load_zext_8i16_to_8i32: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq + +; AVX2-LABEL: load_zext_8i16_to_8i32: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxwd (%rdi), %ymm0 +; AVX2-NEXT: retq +entry: + %X = load <8 x i16>* %ptr + %Y = zext <8 x i16> %X to <8 x i32> + ret <8 x i32>%Y +} + +define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) { +; SSE2-LABEL: load_zext_4i32_to_4i64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pshufd $-44, %xmm1, %xmm0 # xmm0 = xmm1[0,1,1,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pshufd $-6, %xmm1, %xmm1 # xmm1 = xmm1[2,2,3,3] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: retq + +; SSSE3-LABEL: load_zext_4i32_to_4i64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: pshufd $-44, %xmm1, %xmm0 # xmm0 = xmm1[0,1,1,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufd $-6, %xmm1, %xmm1 # xmm1 = xmm1[2,2,3,3] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: retq 
+ +; SSE41-LABEL: load_zext_4i32_to_4i64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxdq %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pshufd $-6, %xmm1, %xmm1 # xmm1 = xmm1[2,2,3,3] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: retq + +; AVX1-LABEL: load_zext_4i32_to_4i64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq + +; AVX2-LABEL: load_zext_4i32_to_4i64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxdq (%rdi), %ymm0 +; AVX2-NEXT: retq +entry: + %X = load <4 x i32>* %ptr + %Y = zext <4 x i32> %X to <4 x i64> + ret <4 x i64>%Y +}