Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -6077,6 +6077,10 @@ (VCVTPS2PDZrm addr:$src)>; let Predicates = [HasVLX] in { + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128X:$src)))))), + (VCVTPD2PSZ128rr VR128X:$src)>; def : Pat<(v2f64 (extloadv2f32 addr:$src)), (VCVTPS2PDZ128rm addr:$src)>; def : Pat<(v4f64 (extloadv4f32 addr:$src)), @@ -6148,8 +6152,8 @@ } // Convert Double to Signed/Unsigned Doubleword with truncation -multiclass avx512_cvttpd2dq opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvttpd2dq opc, string OpcodeStr, SDNode OpNode, + SDNode OpNode128, SDNode OpNodeRnd> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fp, avx512_vcvt_fp_sae, EVEX_V128; + defm Z128 : avx512_vcvt_fp, EVEX_V128; defm Z256 : avx512_vcvt_fp, EVEX_V256; } @@ -6302,7 +6306,7 @@ X86cvttp2siRnd>, XS, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, +defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttpd2dq, X86cvttp2siRnd>, PD, VEX_W, EVEX_CD8<64, CD8VF>; @@ -6310,7 +6314,7 @@ X86cvttp2uiRnd>, PS, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, +defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, fp_to_uint, X86cvttp2uiRnd>, PS, VEX_W, EVEX_CD8<64, CD8VF>; @@ -6408,13 +6412,10 @@ } let Predicates = [HasAVX512, HasVLX] in { - def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))))), + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))), (VCVTTPD2DQZ128rr VR128:$src)>; - def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128X:$src))), - (VCVTTPD2DQZ128rr VR128X:$src)>; - def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))), - (VCVTTPD2DQZ128rm addr:$src)>; } let Predicates = [HasAVX512] in { Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -2065,11 +2065,12 @@ (CVTTPS2DQrm addr:$src)>; } +let Predicates = [HasAVX, NoVLX] in def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvttpd2dq VR128:$src))], - IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; + (v4i32 (X86cvttpd2dq (v2f64 VR128:$src))))], + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. @@ -2078,10 +2079,11 @@ // XMM only def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; +let Predicates = [HasAVX, NoVLX] in def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttpd2dqx\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (loadv2f64 addr:$src)))], + [(set VR128:$dst, + (v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))))], IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>; // YMM only @@ -2099,13 +2101,10 @@ (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))), - (VCVTTPD2DQrr VR128:$src)>; - def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))), + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))), (VCVTTPD2DQrr VR128:$src)>; - def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))), - (VCVTTPD2DQXrm addr:$src)>; def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), (VCVTTPD2DQYrr VR256:$src)>; @@ -2125,8 +2124,9 @@ Sched<[WriteCvtF2ILd]>; let Predicates = [UseSSE2] in { - def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))), + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))), (CVTTPD2DQrr VR128:$src)>; def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))), (CVTTPD2DQrr VR128:$src)>; @@ -2254,8 +2254,9 @@ let Predicates = [HasAVX, NoVLX] in { // Match fpround and fpextend for 128/256-bit conversions - def : Pat<(v4f32 (bitconvert (X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (v2f64 VR128:$src)))))))), + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128:$src)))))), (VCVTPD2PSrr VR128:$src)>; def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), (VCVTPD2PSrr VR128:$src)>; @@ -2272,8 +2273,9 @@ let Predicates = [UseSSE2] in { // Match fpround and fpextend for 128 conversions - def : Pat<(v4f32 (bitconvert (X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (v2f64 VR128:$src)))))))), + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128:$src)))))), (CVTPD2PSrr VR128:$src)>; def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), (CVTPD2PSrr VR128:$src)>; Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -574,7 +574,7 @@ X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VFPEXTS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK, - ISD::FP_TO_SINT, 0), + X86ISD::CVTTPD2DQ, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK, ISD::FP_TO_SINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK, @@ -1636,6 +1636,7 @@ X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE), X86_INTRINSIC_DATA(sse2_cvtdq2ps, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0), X86_INTRINSIC_DATA(sse2_cvtpd2ps, INTR_TYPE_1OP, X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTPD2DQ, 0), X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll +++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -338,10 +338,15 @@ define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) { -; CHECK-LABEL: test_x86_sse2_cvttpd2dq: -; CHECK: ## BB#0: -; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] -; CHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse2_cvttpd2dq: +; AVX: ## BB#0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] +; AVX-NEXT: retl ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_sse2_cvttpd2dq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0] +; AVX512VL-NEXT: retl ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -324,8 +324,6 @@ ; SKX-LABEL: test_x86_sse2_cvtpd2ps_zext: ; SKX: ## BB#0: ; SKX-NEXT: vcvtpd2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x5a,0xc0] -; SKX-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0] -; SKX-NEXT: ## xmm0 = xmm0[0],zero ; SKX-NEXT: retl ## encoding: [0xc3] %cvt = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) %res = shufflevector <4 x float> %cvt, <4 x float> zeroinitializer, <4 x i32> @@ -502,10 +500,15 @@ ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0] ; SSE-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse2_cvttpd2dq: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse2_cvttpd2dq: +; AVX2: ## BB#0: +; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_cvttpd2dq: +; SKX: ## BB#0: +; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0] +; SKX-NEXT: retl ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -516,22 +519,16 @@ ; SSE-LABEL: test_mm_cvttpd_epi32_zext: ; SSE: ## BB#0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0] -; SSE-NEXT: movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0] -; SSE-NEXT: ## xmm0 = xmm0[0],zero ; SSE-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_mm_cvttpd_epi32_zext: ; AVX2: ## BB#0: ; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] -; AVX2-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0] -; AVX2-NEXT: ## xmm0 = xmm0[0],zero ; AVX2-NEXT: retl ## encoding: [0xc3] ; ; SKX-LABEL: test_mm_cvttpd_epi32_zext: ; SKX: ## BB#0: -; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0] -; SKX-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0] -; SKX-NEXT: ## xmm0 = xmm0[0],zero +; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0] ; SKX-NEXT: retl ## encoding: [0xc3] %cvt = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) %res = shufflevector <4 x i32> %cvt, <4 x i32> zeroinitializer, <4 x i32>