Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -702,17 +702,6 @@
   } else
     Promoted = DAG.getNode(NewOpc, dl, NVT, Node->getOperand(0));
 
-  // Assert that the converted value fits in the original type. If it doesn't
-  // (eg: because the value being converted is too big), then the result of the
-  // original operation was undefined anyway, so the assert is still correct.
-  if (Node->getOpcode() == ISD::FP_TO_UINT ||
-      Node->getOpcode() == ISD::STRICT_FP_TO_UINT)
-    NewOpc = ISD::AssertZext;
-  else
-    NewOpc = ISD::AssertSext;
-
-  Promoted = DAG.getNode(NewOpc, dl, NVT, Promoted,
-                         DAG.getValueType(VT.getScalarType()));
   Promoted = DAG.getNode(ISD::TRUNCATE, dl, VT, Promoted);
   Results.push_back(Promoted);
   if (IsStrict)
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30749,20 +30749,6 @@
     } else
       Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
 
-    // Preserve what we know about the size of the original result. If the
-    // result is v2i32, we have to manually widen the assert.
-    if (PromoteVT == MVT::v2i32)
-      Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
-                        DAG.getUNDEF(MVT::v2i32));
-
-    Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
-                      Res.getValueType(), Res,
-                      DAG.getValueType(VT.getVectorElementType()));
-
-    if (PromoteVT == MVT::v2i32)
-      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
-                        DAG.getIntPtrConstant(0, dl));
-
     // Truncate back to the original width.
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); Index: llvm/test/CodeGen/X86/avx-cvt-2.ll =================================================================== --- llvm/test/CodeGen/X86/avx-cvt-2.ll +++ llvm/test/CodeGen/X86/avx-cvt-2.ll @@ -12,7 +12,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: vmovdqa %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -26,7 +29,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: vmovdqa %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -40,8 +46,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: vmovq %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -55,8 +63,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: vmovq %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/avx-fp2int.ll =================================================================== --- llvm/test/CodeGen/X86/avx-fp2int.ll +++ llvm/test/CodeGen/X86/avx-fp2int.ll @@ -7,8 +7,7 @@ ; CHECK-LABEL: test1: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 -; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %c = fptoui <4 x double> %d to <4 x i8> @@ -18,8 +17,7 @@ ; CHECK-LABEL: test2: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 -; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %c = fptosi <4 x double> %d to <4 x i8> Index: llvm/test/CodeGen/X86/avx512-cvt.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-cvt.ll +++ llvm/test/CodeGen/X86/avx512-cvt.ll @@ -2356,6 +2356,7 @@ ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; 
NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2364,6 +2365,7 @@ ; VLDQ-LABEL: test_4f64tosb: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VLDQ-NEXT: vpmovd2m %xmm0, %k1 ; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; VLDQ-NEXT: retq @@ -2371,6 +2373,7 @@ ; VLNODQ-LABEL: test_4f64tosb: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 ; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; VLNODQ-NEXT: retq @@ -2379,6 +2382,7 @@ ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2392,6 +2396,7 @@ ; NOVLDQ-LABEL: test_8f64tosb: ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: retq @@ -2399,6 +2404,7 @@ ; VLDQ-LABEL: test_8f64tosb: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 ; VLDQ-NEXT: vpmovd2m %ymm0, %k1 ; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; VLDQ-NEXT: retq @@ -2406,6 +2412,7 @@ ; VLNODQ-LABEL: test_8f64tosb: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0 ; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 ; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; VLNODQ-NEXT: retq @@ -2413,6 +2420,7 @@ ; DQNOVL-LABEL: test_8f64tosb: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: retq @@ -2426,6 +2434,7 @@ ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -2435,6 +2444,7 @@ ; VLDQ-LABEL: test_2f32tosb: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VLDQ-NEXT: vpmovd2m %xmm0, %k1 ; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; VLDQ-NEXT: retq @@ -2442,6 +2452,7 @@ ; VLNODQ-LABEL: test_2f32tosb: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 ; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; VLNODQ-NEXT: retq @@ -2450,6 +2461,7 @@ ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -2465,6 +2477,7 @@ ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2473,6 +2486,7 @@ ; VLDQ-LABEL: test_4f32tosb: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VLDQ-NEXT: vpmovd2m %xmm0, %k1 ; 
VLDQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
 ; VLDQ-NEXT:    retq
@@ -2480,6 +2494,7 @@
 ; VLNODQ-LABEL: test_4f32tosb:
 ; VLNODQ:       # %bb.0:
 ; VLNODQ-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VLNODQ-NEXT:    vpslld $31, %xmm0, %xmm0
 ; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
 ; VLNODQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
 ; VLNODQ-NEXT:    retq
@@ -2488,6 +2503,7 @@
 ; DQNOVL:       # %bb.0:
 ; DQNOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; DQNOVL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; DQNOVL-NEXT:    vpslld $31, %xmm0, %xmm0
 ; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -2501,6 +2517,7 @@
 ; NOVLDQ-LABEL: test_8f32tosb:
 ; NOVLDQ:       # %bb.0:
 ; NOVLDQ-NEXT:    vcvttps2dq %ymm0, %ymm0
+; NOVLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; NOVLDQ-NEXT:    retq
@@ -2508,6 +2525,7 @@
 ; VLDQ-LABEL: test_8f32tosb:
 ; VLDQ:       # %bb.0:
 ; VLDQ-NEXT:    vcvttps2dq %ymm0, %ymm0
+; VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
 ; VLDQ-NEXT:    vpmovd2m %ymm0, %k1
 ; VLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; VLDQ-NEXT:    retq
@@ -2515,6 +2533,7 @@
 ; VLNODQ-LABEL: test_8f32tosb:
 ; VLNODQ:       # %bb.0:
 ; VLNODQ-NEXT:    vcvttps2dq %ymm0, %ymm0
+; VLNODQ-NEXT:    vpslld $31, %ymm0, %ymm0
 ; VLNODQ-NEXT:    vptestmd %ymm0, %ymm0, %k1
 ; VLNODQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; VLNODQ-NEXT:    retq
@@ -2522,6 +2541,7 @@
 ; DQNOVL-LABEL: test_8f32tosb:
 ; DQNOVL:       # %bb.0:
 ; DQNOVL-NEXT:    vcvttps2dq %ymm0, %ymm0
+; DQNOVL-NEXT:    vpslld $31, %ymm0, %ymm0
 ; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    retq
@@ -2534,6 +2554,7 @@
 ; NODQ-LABEL: test_16f32tosb:
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vcvttps2dq %zmm0, %zmm0
+; NODQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NODQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NODQ-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 ; NODQ-NEXT:    retq
@@ -2541,6 +2562,7 @@
 ; VLDQ-LABEL: test_16f32tosb:
 ; VLDQ:       # %bb.0:
 ; VLDQ-NEXT:    vcvttps2dq %zmm0, %zmm0
+; VLDQ-NEXT:    vpslld $31, %zmm0, %zmm0
 ; VLDQ-NEXT:    vpmovd2m %zmm0, %k1
 ; VLDQ-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 ; VLDQ-NEXT:    retq
@@ -2548,6 +2570,7 @@
 ; DQNOVL-LABEL: test_16f32tosb:
 ; DQNOVL:       # %bb.0:
 ; DQNOVL-NEXT:    vcvttps2dq %zmm0, %zmm0
+; DQNOVL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT:    retq
Index: llvm/test/CodeGen/X86/fptoui-may-overflow.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/fptoui-may-overflow.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
+
+; We can't generate just "cvttps2dq %xmm0, %xmm0" for this function, because
+; the fptoui may overflow. The old DAG optimization used to rewrite
+; "%f = fptoui <4 x float> %arg to <4 x i8>" as
+; "%f = fptosi <4 x float> %arg to <4 x i16>" + "AssertZext %f, ValueType:ch:i8",
+; which is not equivalent when the fptoui overflows.
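+;
+; Hypothetical example (not exercised by the CHECK lines below): for an input
+; lane of 300.0, the promoted fptosi produces the i32 value 300 (0x12C), whose
+; bits above bit 7 are not zero, so the AssertZext's claim was false; that let
+; later combines drop the byte masking (the vpand checked below) that a
+; correct lowering has to keep for the lanes taken from zeroinitializer.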
+ +define <16 x i8> @src(<4 x float> %arg) { +; CHECK-LABEL: src: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %f = fptoui <4 x float> %arg to <4 x i8> + %s = shufflevector <4 x i8> %f, <4 x i8> undef, <16 x i32> + %ss = shufflevector <16 x i8> %s, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %ss +} Index: llvm/test/CodeGen/X86/min-legal-vector-width.ll =================================================================== --- llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -627,8 +627,10 @@ ; CHECK-LABEL: test_16f32tosb_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1 +; CHECK-NEXT: vpslld $31, %ymm1, %ymm1 ; CHECK-NEXT: vpmovd2m %ymm1, %k0 ; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1 +; CHECK-NEXT: vpslld $31, %ymm1, %ymm1 ; CHECK-NEXT: vpmovd2m %ymm1, %k1 ; CHECK-NEXT: kunpckbw %k0, %k1, %k1 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} @@ -643,6 +645,7 @@ ; CHECK-LABEL: test_16f32tosb_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1 +; CHECK-NEXT: vpslld $31, %zmm1, %zmm1 ; CHECK-NEXT: vpmovd2m %zmm1, %k1 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/pr48727.ll =================================================================== --- llvm/test/CodeGen/X86/pr48727.ll +++ llvm/test/CodeGen/X86/pr48727.ll @@ -5,15 +5,19 @@ ; CHECK-LABEL: PR48727: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vcvttpd2dqy 0, %xmm0 +; CHECK-NEXT: vpmovdw %xmm0, %xmm0 ; CHECK-NEXT: vcvttpd2dqy 128, %xmm1 -; CHECK-NEXT: movq (%rax), %rax +; CHECK-NEXT: vpmovdw %xmm1, %xmm1 ; CHECK-NEXT: vcvttpd2dqy 160, %xmm2 +; CHECK-NEXT: vpmovdw %xmm2, %xmm2 +; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; CHECK-NEXT: vcvttpd2dqy (%rax), %xmm2 +; CHECK-NEXT: vpmovdw %xmm2, %xmm2 ; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: vpmovdw %zmm0, %ymm0 -; CHECK-NEXT: vmovdqu %ymm0, 16(%rax) +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6] +; CHECK-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vmovdqu %ymm2, 16(%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: Index: llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -1829,43 +1829,43 @@ ; SSE-32-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; SSE-32: # %bb.0: ; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 +; SSE-32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 +; SSE-64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; 
; AVX512VL-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f64(<2 x double> %a, metadata !"fpexcept.strict") #0 @@ -1888,31 +1888,31 @@ ; AVX-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f64(<2 x double> %a, metadata !"fpexcept.strict") #0 @@ -1924,49 +1924,49 @@ ; SSE-32: # %bb.0: ; SSE-32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 +; SSE-32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; SSE-64: # %bb.0: ; SSE-64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 +; SSE-64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512VL: # %bb.0: ; 
AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f32(<2 x float> %a, metadata !"fpexcept.strict") #0 @@ -1992,35 +1992,35 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f32(<2 x float> %a, metadata !"fpexcept.strict") #0 @@ -2031,29 +2031,29 @@ ; SSE-32-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; SSE-32: # %bb.0: ; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 -; SSE-32-NEXT: packsswb %xmm0, %xmm0 +; SSE-32-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 -; SSE-64-NEXT: packsswb %xmm0, %xmm0 +; SSE-64-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} 
xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v2f64_to_v2i8: @@ -2065,8 +2065,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f64_to_v2i8: @@ -2083,6 +2082,7 @@ ; SSE-32-LABEL: strict_vector_fptoui_v2f64_to_v2i8: ; SSE-32: # %bb.0: ; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-32-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl @@ -2090,6 +2090,7 @@ ; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i8: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-64-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq @@ -2097,15 +2098,13 @@ ; AVX-LABEL: strict_vector_fptoui_v2f64_to_v2i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v2f64_to_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i8: @@ -2117,8 +2116,7 @@ ; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f64_to_v2i8: @@ -2136,32 +2134,32 @@ ; SSE-32: # %bb.0: ; SSE-32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 -; SSE-32-NEXT: packsswb %xmm0, %xmm0 +; SSE-32-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i8: ; SSE-64: # %bb.0: ; SSE-64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 -; SSE-64-NEXT: packsswb %xmm0, %xmm0 +; SSE-64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v2f32_to_v2i8: ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; 
AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v2f32_to_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v2f32_to_v2i8: @@ -2175,8 +2173,7 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f32_to_v2i8: @@ -2195,6 +2192,7 @@ ; SSE-32: # %bb.0: ; SSE-32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-32-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl @@ -2203,6 +2201,7 @@ ; SSE-64: # %bb.0: ; SSE-64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq @@ -2211,16 +2210,14 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v2f32_to_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f32_to_v2i8: @@ -2234,8 +2231,7 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f32_to_v2i8: @@ -3110,29 +3106,29 @@ ; SSE-32-LABEL: strict_vector_fptosi_v4f32_to_v4i8: ; SSE-32: # %bb.0: ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 -; SSE-32-NEXT: packsswb %xmm0, %xmm0 +; SSE-32-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v4f32_to_v4i8: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 -; SSE-64-NEXT: packsswb %xmm0, %xmm0 +; SSE-64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v4f32_to_v4i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: 
strict_vector_fptosi_v4f32_to_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v4f32_to_v4i8: @@ -3144,8 +3140,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v4f32_to_v4i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v4f32_to_v4i8: @@ -3162,6 +3157,7 @@ ; SSE-32-LABEL: strict_vector_fptoui_v4f32_to_v4i8: ; SSE-32: # %bb.0: ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-32-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl @@ -3169,6 +3165,7 @@ ; SSE-64-LABEL: strict_vector_fptoui_v4f32_to_v4i8: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq @@ -3176,15 +3173,13 @@ ; AVX-LABEL: strict_vector_fptoui_v4f32_to_v4i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v4f32_to_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v4f32_to_v4i8: @@ -3196,8 +3191,7 @@ ; AVX512DQ-LABEL: strict_vector_fptoui_v4f32_to_v4i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v4f32_to_v4i8: @@ -3229,6 +3223,7 @@ ; AVX512F-LABEL: strict_vector_fptosi_v4f32_to_v4i1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -3238,6 +3233,7 @@ ; AVX512VL-LABEL: strict_vector_fptosi_v4f32_to_v4i1: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} @@ -3246,6 +3242,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v4f32_to_v4i1: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -3255,6 +3252,7 @@ ; AVX512VLDQ-LABEL: strict_vector_fptosi_v4f32_to_v4i1: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: 
vpslld $31, %xmm0, %xmm0 ; AVX512VLDQ-NEXT: vpmovd2m %xmm0, %k0 ; AVX512VLDQ-NEXT: vpmovm2d %k0, %xmm0 ; AVX512VLDQ-NEXT: ret{{[l|q]}} Index: llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -1164,7 +1164,7 @@ ; CHECK-LABEL: strict_vector_fptosi_v4f64_to_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 -; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %ret = call <4 x i16> @llvm.experimental.constrained.fptosi.v4i16.v4f64(<4 x double> %a, @@ -1176,7 +1176,7 @@ ; CHECK-LABEL: strict_vector_fptoui_v4f64_to_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 -; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %ret = call <4 x i16> @llvm.experimental.constrained.fptoui.v4i16.v4f64(<4 x double> %a, @@ -1188,16 +1188,14 @@ ; AVX-LABEL: strict_vector_fptosi_v4f64_to_v4i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v4f64_to_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} ; @@ -1211,8 +1209,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v4f64_to_v4i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: ret{{[l|q]}} ; @@ -1231,16 +1228,14 @@ ; AVX-LABEL: strict_vector_fptoui_v4f64_to_v4i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v4f64_to_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} ; @@ -1254,8 +1249,7 @@ ; AVX512DQ-LABEL: strict_vector_fptoui_v4f64_to_v4i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: ret{{[l|q]}} ; @@ -1280,6 +1274,7 @@ ; AVX512F-LABEL: strict_vector_fptosi_v4f64_to_v4i1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 
{%k1} {z} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1289,6 +1284,7 @@ ; AVX512VL-LABEL: strict_vector_fptosi_v4f64_to_v4i1: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} @@ -1298,6 +1294,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v4f64_to_v4i1: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1307,6 +1304,7 @@ ; AVX512DQVL-LABEL: strict_vector_fptosi_v4f64_to_v4i1: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512DQVL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpmovd2m %xmm0, %k0 ; AVX512DQVL-NEXT: vpmovm2d %k0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper @@ -1423,7 +1421,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; @@ -1466,7 +1467,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; @@ -1509,8 +1513,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; @@ -1551,8 +1557,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; @@ -1593,13 +1601,17 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v8f32_to_v8i1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 @@ -1610,6 +1622,7 @@ ; 
AVX512VL-LABEL: strict_vector_fptosi_v8f32_to_v8i1: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} @@ -1620,6 +1633,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v8f32_to_v8i1: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 @@ -1630,6 +1644,7 @@ ; AVX512DQVL-LABEL: strict_vector_fptosi_v8f32_to_v8i1: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmovd2m %ymm0, %k0 ; AVX512DQVL-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 @@ -1645,7 +1660,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; Index: llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -716,6 +716,7 @@ ; AVX512VL-LABEL: strict_vector_fptosi_v8f64_to_v8i1: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} @@ -726,6 +727,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v8f64_to_v8i1: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 @@ -834,6 +836,7 @@ ; AVX512VL-LABEL: strict_vector_fptosi_v16f32_to_v16i1: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2dq %zmm0, %zmm0 +; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 @@ -843,6 +846,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v16f32_to_v16i1: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 Index: llvm/test/CodeGen/X86/vec_cast2.ll =================================================================== --- llvm/test/CodeGen/X86/vec_cast2.ll +++ llvm/test/CodeGen/X86/vec_cast2.ll @@ -98,8 +98,10 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %res = fptosi <8 x float> %src to <8 x i8> @@ -111,7 +113,10 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: 
vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %res = fptosi <8 x float> %src to <8 x i16> @@ -122,8 +127,7 @@ ; CHECK-LABEL: cvt_v4f32_v4i8: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl %res = fptosi <4 x float> %src to <4 x i8> ret <4 x i8> %res @@ -133,7 +137,7 @@ ; CHECK-LABEL: cvt_v4f32_v4i16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: retl %res = fptosi <4 x float> %src to <4 x i16> ret <4 x i16> %res @@ -144,8 +148,10 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %res = fptoui <8 x float> %src to <8 x i8> @@ -157,7 +163,10 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %res = fptoui <8 x float> %src to <8 x i16> @@ -168,8 +177,7 @@ ; CHECK-LABEL: cvt_v4f32_v4u8: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl %res = fptoui <4 x float> %src to <4 x i8> ret <4 x i8> %res @@ -179,7 +187,7 @@ ; CHECK-LABEL: cvt_v4f32_v4u16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: retl %res = fptoui <4 x float> %src to <4 x i16> ret <4 x i16> %res Index: llvm/test/CodeGen/X86/vec_cast3.ll =================================================================== --- llvm/test/CodeGen/X86/vec_cast3.ll +++ llvm/test/CodeGen/X86/vec_cast3.ll @@ -67,8 +67,7 @@ ; CHECK-LABEL: cvt_v2f32_v2i8: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl %res = fptosi <2 x float> %src to <2 x i8> ret <2 x i8> %res @@ -78,7 +77,7 @@ ; CHECK-LABEL: cvt_v2f32_v2i16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: 
retl %res = fptosi <2 x float> %src to <2 x i16> ret <2 x i16> %res @@ -97,8 +96,7 @@ ; CHECK-LABEL: cvt_v2f32_v2u8: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl %res = fptoui <2 x float> %src to <2 x i8> ret <2 x i8> %res @@ -108,7 +106,7 @@ ; CHECK-LABEL: cvt_v2f32_v2u16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retl %res = fptoui <2 x float> %src to <2 x i16> ret <2 x i16> %res Index: llvm/test/CodeGen/X86/vec_fp_to_int.ll =================================================================== --- llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -2297,22 +2297,21 @@ ; SSE-LABEL: fptosi_2f32_to_2i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: packssdw %xmm0, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f32_to_2i8: ; VEX: # %bb.0: ; VEX-NEXT: vcvttps2dq %xmm0, %xmm0 -; VEX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptosi_2f32_to_2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptosi_2f32_to_2i8: @@ -2324,8 +2323,7 @@ ; AVX512DQ-LABEL: fptosi_2f32_to_2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptosi_2f32_to_2i8: @@ -2341,13 +2339,13 @@ ; SSE-LABEL: fptosi_2f32_to_2i16: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: packssdw %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f32_to_2i16: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i16> ret <2 x i16> %cvt @@ -2357,6 +2355,7 @@ ; SSE-LABEL: fptoui_2f32_to_2i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq @@ -2364,15 +2363,13 @@ ; VEX-LABEL: fptoui_2f32_to_2i8: ; VEX: # %bb.0: ; VEX-NEXT: vcvttps2dq %xmm0, %xmm0 -; VEX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f32_to_2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: 
fptoui_2f32_to_2i8: @@ -2384,8 +2381,7 @@ ; AVX512DQ-LABEL: fptoui_2f32_to_2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f32_to_2i8: @@ -2407,7 +2403,7 @@ ; AVX-LABEL: fptoui_2f32_to_2i16: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i16> ret <2 x i16> %cvt @@ -2417,22 +2413,21 @@ ; SSE-LABEL: fptosi_2f64_to_2i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: packssdw %xmm0, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f64_to_2i8: ; VEX: # %bb.0: ; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; VEX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptosi_2f64_to_2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptosi_2f64_to_2i8: @@ -2444,8 +2439,7 @@ ; AVX512DQ-LABEL: fptosi_2f64_to_2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptosi_2f64_to_2i8: @@ -2461,13 +2455,13 @@ ; SSE-LABEL: fptosi_2f64_to_2i16: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: packssdw %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f64_to_2i16: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i16> ret <2 x i16> %cvt @@ -2477,6 +2471,7 @@ ; SSE-LABEL: fptoui_2f64_to_2i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq @@ -2484,15 +2479,13 @@ ; VEX-LABEL: fptoui_2f64_to_2i8: ; VEX: # %bb.0: ; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; VEX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f64_to_2i8: @@ -2504,8 +2497,7 @@ ; AVX512DQ-LABEL: fptoui_2f64_to_2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f64_to_2i8: @@ -2527,7 +2519,7 @@ ; AVX-LABEL: fptoui_2f64_to_2i16: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i16> ret <2 x i16> %cvt @@ -2536,20 +2528,27 @@ define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) { ; SSE-LABEL: fptosi_8f64_to_8i16: ; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm3, %xmm3 -; SSE-NEXT: cvttpd2dq %xmm2, %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7] +; SSE-NEXT: cvttpd2dq %xmm3, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm2, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_8f64_to_8i16: ; VEX: # %bb.0: ; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1 +; VEX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; VEX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; VEX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; VEX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; VEX-NEXT: vzeroupper ; VEX-NEXT: retq ; @@ -2589,25 +2588,26 @@ define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) { ; SSE-LABEL: fptoui_8f64_to_8i16: ; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm3, %xmm3 -; SSE-NEXT: cvttpd2dq %xmm2, %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7] +; SSE-NEXT: cvttpd2dq %xmm3, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm2, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_8f64_to_8i16: ; VEX: # %bb.0: ; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1 +; VEX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VEX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] ; VEX-NEXT: 
vpackusdw %xmm1, %xmm0, %xmm0 ; VEX-NEXT: vzeroupper ; VEX-NEXT: retq @@ -2649,35 +2649,52 @@ ; SSE-LABEL: fptosi_16f32_to_16i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 ; SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: fptosi_16f32_to_16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] +; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptosi_16f32_to_16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2695,11 +2712,16 @@ ; SSE-LABEL: fptoui_16f32_to_16i8: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm3, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 ; SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -2707,10 +2729,18 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor 
%xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] +; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2718,11 +2748,15 @@ ; AVX2-LABEL: fptoui_16f32_to_16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq