Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -702,17 +702,6 @@ } else Promoted = DAG.getNode(NewOpc, dl, NVT, Node->getOperand(0)); - // Assert that the converted value fits in the original type. If it doesn't - // (eg: because the value being converted is too big), then the result of the - // original operation was undefined anyway, so the assert is still correct. - if (Node->getOpcode() == ISD::FP_TO_UINT || - Node->getOpcode() == ISD::STRICT_FP_TO_UINT) - NewOpc = ISD::AssertZext; - else - NewOpc = ISD::AssertSext; - - Promoted = DAG.getNode(NewOpc, dl, NVT, Promoted, - DAG.getValueType(VT.getScalarType())); Promoted = DAG.getNode(ISD::TRUNCATE, dl, VT, Promoted); Results.push_back(Promoted); if (IsStrict) Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30749,20 +30749,6 @@ } else Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); - // Preserve what we know about the size of the original result. If the - // result is v2i32, we have to manually widen the assert. - if (PromoteVT == MVT::v2i32) - Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res, - DAG.getUNDEF(MVT::v2i32)); - - Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl, - Res.getValueType(), Res, - DAG.getValueType(VT.getVectorElementType())); - - if (PromoteVT == MVT::v2i32) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - // Truncate back to the original width. 
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); Index: llvm/test/CodeGen/X86/avx-cvt-2.ll =================================================================== --- llvm/test/CodeGen/X86/avx-cvt-2.ll +++ llvm/test/CodeGen/X86/avx-cvt-2.ll @@ -12,7 +12,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: vmovdqa %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -26,7 +29,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: vmovdqa %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -40,8 +46,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: vmovq %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -55,8 +63,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: vmovq %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/avx-fp2int.ll =================================================================== --- llvm/test/CodeGen/X86/avx-fp2int.ll +++ llvm/test/CodeGen/X86/avx-fp2int.ll @@ -7,8 +7,7 @@ ; CHECK-LABEL: test1: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 -; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %c = fptoui <4 x double> %d to <4 x i8> @@ -18,8 +17,7 @@ ; CHECK-LABEL: test2: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 -; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %c = fptosi <4 x double> %d to <4 x i8> Index: llvm/test/CodeGen/X86/avx512-cvt.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-cvt.ll +++ llvm/test/CodeGen/X86/avx512-cvt.ll @@ -2356,6 +2356,7 @@ ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; 
NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2364,6 +2365,7 @@ ; VLDQ-LABEL: test_4f64tosb: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VLDQ-NEXT: vpmovd2m %xmm0, %k1 ; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; VLDQ-NEXT: retq @@ -2371,6 +2373,7 @@ ; VLNODQ-LABEL: test_4f64tosb: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 ; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; VLNODQ-NEXT: retq @@ -2379,6 +2382,7 @@ ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2392,6 +2396,7 @@ ; NOVLDQ-LABEL: test_8f64tosb: ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: retq @@ -2399,6 +2404,7 @@ ; VLDQ-LABEL: test_8f64tosb: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 ; VLDQ-NEXT: vpmovd2m %ymm0, %k1 ; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; VLDQ-NEXT: retq @@ -2406,6 +2412,7 @@ ; VLNODQ-LABEL: test_8f64tosb: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0 ; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 ; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; VLNODQ-NEXT: retq @@ -2413,6 +2420,7 @@ ; DQNOVL-LABEL: test_8f64tosb: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: retq @@ -2426,6 +2434,7 @@ ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -2435,6 +2444,7 @@ ; VLDQ-LABEL: test_2f32tosb: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VLDQ-NEXT: vpmovd2m %xmm0, %k1 ; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; VLDQ-NEXT: retq @@ -2442,6 +2452,7 @@ ; VLNODQ-LABEL: test_2f32tosb: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 ; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; VLNODQ-NEXT: retq @@ -2450,6 +2461,7 @@ ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -2465,6 +2477,7 @@ ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2473,6 +2486,7 @@ ; VLDQ-LABEL: test_4f32tosb: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VLDQ-NEXT: vpmovd2m %xmm0, %k1 ; 
VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; VLDQ-NEXT: retq @@ -2480,6 +2494,7 @@ ; VLNODQ-LABEL: test_4f32tosb: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 ; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 ; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; VLNODQ-NEXT: retq @@ -2488,6 +2503,7 @@ ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2501,6 +2517,7 @@ ; NOVLDQ-LABEL: test_8f32tosb: ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: retq @@ -2508,6 +2525,7 @@ ; VLDQ-LABEL: test_8f32tosb: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 ; VLDQ-NEXT: vpmovd2m %ymm0, %k1 ; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; VLDQ-NEXT: retq @@ -2515,6 +2533,7 @@ ; VLNODQ-LABEL: test_8f32tosb: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0 ; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 ; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; VLNODQ-NEXT: retq @@ -2522,6 +2541,7 @@ ; DQNOVL-LABEL: test_8f32tosb: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0 +; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: retq @@ -2534,6 +2554,7 @@ ; NODQ-LABEL: test_16f32tosb: ; NODQ: # %bb.0: ; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; NODQ-NEXT: vpslld $31, %zmm0, %zmm0 ; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} ; NODQ-NEXT: retq @@ -2541,6 +2562,7 @@ ; VLDQ-LABEL: test_16f32tosb: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; VLDQ-NEXT: vpslld $31, %zmm0, %zmm0 ; VLDQ-NEXT: vpmovd2m %zmm0, %k1 ; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} ; VLDQ-NEXT: retq @@ -2548,6 +2570,7 @@ ; DQNOVL-LABEL: test_16f32tosb: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0 +; DQNOVL-NEXT: vpslld $31, %zmm0, %zmm0 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: retq Index: llvm/test/CodeGen/X86/min-legal-vector-width.ll =================================================================== --- llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -627,8 +627,10 @@ ; CHECK-LABEL: test_16f32tosb_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1 +; CHECK-NEXT: vpslld $31, %ymm1, %ymm1 ; CHECK-NEXT: vpmovd2m %ymm1, %k0 ; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1 +; CHECK-NEXT: vpslld $31, %ymm1, %ymm1 ; CHECK-NEXT: vpmovd2m %ymm1, %k1 ; CHECK-NEXT: kunpckbw %k0, %k1, %k1 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} @@ -643,6 +645,7 @@ ; CHECK-LABEL: test_16f32tosb_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1 +; CHECK-NEXT: vpslld $31, %zmm1, %zmm1 ; CHECK-NEXT: vpmovd2m %zmm1, %k1 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/pr48727.ll =================================================================== --- llvm/test/CodeGen/X86/pr48727.ll +++ llvm/test/CodeGen/X86/pr48727.ll @@ -5,15 +5,19 @@ ; CHECK-LABEL: PR48727: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vcvttpd2dqy 0, 
%xmm0 +; CHECK-NEXT: vpmovdw %xmm0, %xmm0 ; CHECK-NEXT: vcvttpd2dqy 128, %xmm1 -; CHECK-NEXT: movq (%rax), %rax +; CHECK-NEXT: vpmovdw %xmm1, %xmm1 ; CHECK-NEXT: vcvttpd2dqy 160, %xmm2 +; CHECK-NEXT: vpmovdw %xmm2, %xmm2 +; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; CHECK-NEXT: vcvttpd2dqy (%rax), %xmm2 +; CHECK-NEXT: vpmovdw %xmm2, %xmm2 ; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: vpmovdw %zmm0, %ymm0 -; CHECK-NEXT: vmovdqu %ymm0, 16(%rax) +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6] +; CHECK-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vmovdqu %ymm2, 16(%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry:
Index: llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -1829,43 +1829,43 @@ ; SSE-32-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; SSE-32: # %bb.0: ; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 +; SSE-32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 +; SSE-64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f64(<2 x double> %a, metadata !"fpexcept.strict") #0 @@ -1888,31 +1888,31 @@ ; AVX-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: ret{{[l|q]}} ; ; 
AVX512F-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f64(<2 x double> %a, metadata !"fpexcept.strict") #0 @@ -1924,49 +1924,49 @@ ; SSE-32: # %bb.0: ; SSE-32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 +; SSE-32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; SSE-64: # %bb.0: ; SSE-64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 +; SSE-64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f32(<2 x float> %a, metadata !"fpexcept.strict") #0 @@ -1992,35 +1992,35 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw 
{{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f32(<2 x float> %a, metadata !"fpexcept.strict") #0 @@ -2031,29 +2031,29 @@ ; SSE-32-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; SSE-32: # %bb.0: ; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 -; SSE-32-NEXT: packsswb %xmm0, %xmm0 +; SSE-32-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 -; SSE-64-NEXT: packsswb %xmm0, %xmm0 +; SSE-64-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v2f64_to_v2i8: @@ -2065,8 +2065,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f64_to_v2i8: @@ -2083,6 +2082,7 @@ ; SSE-32-LABEL: strict_vector_fptoui_v2f64_to_v2i8: ; SSE-32: # %bb.0: ; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-32-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl @@ -2090,6 +2090,7 @@ ; SSE-64-LABEL: 
strict_vector_fptoui_v2f64_to_v2i8: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-64-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq @@ -2097,15 +2098,13 @@ ; AVX-LABEL: strict_vector_fptoui_v2f64_to_v2i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v2f64_to_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i8: @@ -2117,8 +2116,7 @@ ; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f64_to_v2i8: @@ -2136,32 +2134,32 @@ ; SSE-32: # %bb.0: ; SSE-32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 -; SSE-32-NEXT: packsswb %xmm0, %xmm0 +; SSE-32-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i8: ; SSE-64: # %bb.0: ; SSE-64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 -; SSE-64-NEXT: packsswb %xmm0, %xmm0 +; SSE-64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v2f32_to_v2i8: ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v2f32_to_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v2f32_to_v2i8: @@ -2175,8 +2173,7 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f32_to_v2i8: @@ -2195,6 +2192,7 @@ ; SSE-32: # %bb.0: ; SSE-32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-32-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl @@ -2203,6 +2201,7 @@ ; SSE-64: # %bb.0: 
; SSE-64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq @@ -2211,16 +2210,14 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v2f32_to_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f32_to_v2i8: @@ -2234,8 +2231,7 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f32_to_v2i8: @@ -3110,29 +3106,29 @@ ; SSE-32-LABEL: strict_vector_fptosi_v4f32_to_v4i8: ; SSE-32: # %bb.0: ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 -; SSE-32-NEXT: packsswb %xmm0, %xmm0 +; SSE-32-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v4f32_to_v4i8: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 -; SSE-64-NEXT: packsswb %xmm0, %xmm0 +; SSE-64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v4f32_to_v4i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v4f32_to_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v4f32_to_v4i8: @@ -3144,8 +3140,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v4f32_to_v4i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v4f32_to_v4i8: @@ -3162,6 +3157,7 @@ ; SSE-32-LABEL: strict_vector_fptoui_v4f32_to_v4i8: ; SSE-32: # %bb.0: ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-32-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: packuswb %xmm0, %xmm0 ; SSE-32-NEXT: retl @@ -3169,6 +3165,7 @@ ; SSE-64-LABEL: strict_vector_fptoui_v4f32_to_v4i8: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 +; 
SSE-64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: packuswb %xmm0, %xmm0 ; SSE-64-NEXT: retq @@ -3176,15 +3173,13 @@ ; AVX-LABEL: strict_vector_fptoui_v4f32_to_v4i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v4f32_to_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v4f32_to_v4i8: @@ -3196,8 +3191,7 @@ ; AVX512DQ-LABEL: strict_vector_fptoui_v4f32_to_v4i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v4f32_to_v4i8: @@ -3229,6 +3223,7 @@ ; AVX512F-LABEL: strict_vector_fptosi_v4f32_to_v4i1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -3238,6 +3233,7 @@ ; AVX512VL-LABEL: strict_vector_fptosi_v4f32_to_v4i1: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} @@ -3246,6 +3242,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v4f32_to_v4i1: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -3255,6 +3252,7 @@ ; AVX512VLDQ-LABEL: strict_vector_fptosi_v4f32_to_v4i1: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512VLDQ-NEXT: vpmovd2m %xmm0, %k0 ; AVX512VLDQ-NEXT: vpmovm2d %k0, %xmm0 ; AVX512VLDQ-NEXT: ret{{[l|q]}} Index: llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -1164,7 +1164,7 @@ ; CHECK-LABEL: strict_vector_fptosi_v4f64_to_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 -; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %ret = call <4 x i16> @llvm.experimental.constrained.fptosi.v4i16.v4f64(<4 x double> %a, @@ -1176,7 +1176,7 @@ ; CHECK-LABEL: strict_vector_fptoui_v4f64_to_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 -; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %ret = call <4 x i16> @llvm.experimental.constrained.fptoui.v4i16.v4f64(<4 x double> %a, @@ 
-1188,16 +1188,14 @@ ; AVX-LABEL: strict_vector_fptosi_v4f64_to_v4i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v4f64_to_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} ; @@ -1211,8 +1209,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v4f64_to_v4i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: ret{{[l|q]}} ; @@ -1231,16 +1228,14 @@ ; AVX-LABEL: strict_vector_fptoui_v4f64_to_v4i8: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v4f64_to_v4i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} ; @@ -1254,8 +1249,7 @@ ; AVX512DQ-LABEL: strict_vector_fptoui_v4f64_to_v4i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: ret{{[l|q]}} ; @@ -1280,6 +1274,7 @@ ; AVX512F-LABEL: strict_vector_fptosi_v4f64_to_v4i1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1289,6 +1284,7 @@ ; AVX512VL-LABEL: strict_vector_fptosi_v4f64_to_v4i1: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} @@ -1298,6 +1294,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v4f64_to_v4i1: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1307,6 +1304,7 @@ ; AVX512DQVL-LABEL: strict_vector_fptosi_v4f64_to_v4i1: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512DQVL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpmovd2m %xmm0, %k0 ; AVX512DQVL-NEXT: vpmovm2d %k0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper @@ -1423,7 +1421,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; 
AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; @@ -1466,7 +1467,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; @@ -1509,8 +1513,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; @@ -1551,8 +1557,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; @@ -1593,13 +1601,17 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vzeroupper ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v8f32_to_v8i1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 @@ -1610,6 +1622,7 @@ ; AVX512VL-LABEL: strict_vector_fptosi_v8f32_to_v8i1: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} @@ -1620,6 +1633,7 @@ ; AVX512DQ-LABEL: strict_vector_fptosi_v8f32_to_v8i1: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 @@ -1630,6 +1644,7 @@ ; AVX512DQVL-LABEL: strict_vector_fptosi_v8f32_to_v8i1: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmovd2m %ymm0, %k0 ; AVX512DQVL-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 @@ -1645,7 +1660,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vzeroupper
; AVX-NEXT: ret{{[l|q]}}
;
Index: llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
===================================================================
--- llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
+++ llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
@@ -716,6 +716,7 @@
; AVX512VL-LABEL: strict_vector_fptosi_v8f64_to_v8i1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -726,6 +727,7 @@
; AVX512DQ-LABEL: strict_vector_fptosi_v8f64_to_v8i1:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0
+; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
@@ -834,6 +836,7 @@
; AVX512VL-LABEL: strict_vector_fptosi_v16f32_to_v16i1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttps2dq %zmm0, %zmm0
+; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
@@ -843,6 +846,7 @@
; AVX512DQ-LABEL: strict_vector_fptosi_v16f32_to_v16i1:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttps2dq %zmm0, %zmm0
+; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
Index: llvm/test/CodeGen/X86/vec_cast2.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_cast2.ll
+++ llvm/test/CodeGen/X86/vec_cast2.ll
@@ -98,8 +98,10 @@
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
%res = fptosi <8 x float> %src to <8 x i8>
@@ -111,7 +113,10 @@
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
%res = fptosi <8 x float> %src to <8 x i16>
@@ -122,8 +127,7 @@
; CHECK-LABEL: cvt_v4f32_v4i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
%res = fptosi <4 x float> %src to <4 x i8>
ret <4 x i8> %res
@@ -133,7 +137,7 @@
; CHECK-LABEL: cvt_v4f32_v4i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: retl
%res = fptosi <4 x float> %src to <4 x i16>
ret <4 x i16> %res
@@ -144,8 +148,10 @@
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
%res = fptoui <8 x float> %src to <8 x i8>
@@ -157,7 +163,10 @@
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
%res = fptoui <8 x float> %src to <8 x i16>
@@ -168,8 +177,7 @@
; CHECK-LABEL: cvt_v4f32_v4u8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
%res = fptoui <4 x float> %src to <4 x i8>
ret <4 x i8> %res
@@ -179,7 +187,7 @@
; CHECK-LABEL: cvt_v4f32_v4u16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: retl
%res = fptoui <4 x float> %src to <4 x i16>
ret <4 x i16> %res
Index: llvm/test/CodeGen/X86/vec_cast3.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_cast3.ll
+++ llvm/test/CodeGen/X86/vec_cast3.ll
@@ -67,8 +67,7 @@
; CHECK-LABEL: cvt_v2f32_v2i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
%res = fptosi <2 x float> %src to <2 x i8>
ret <2 x i8> %res
@@ -78,7 +77,7 @@
; CHECK-LABEL: cvt_v2f32_v2i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-NEXT: retl
%res = fptosi <2 x float> %src to <2 x i16>
ret <2 x i16> %res
@@ -97,8 +96,7 @@
; CHECK-LABEL: cvt_v2f32_v2u8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
%res = fptoui <2 x float> %src to <2 x i8>
ret <2 x i8> %res
@@ -108,7 +106,7 @@
; CHECK-LABEL: cvt_v2f32_v2u16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-NEXT: retl
%res = fptoui <2 x float> %src to <2 x i16>
ret <2 x i16> %res
Index: llvm/test/CodeGen/X86/vec_fp_to_int.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2297,22 +2297,21 @@
; SSE-LABEL: fptosi_2f32_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm0, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f32_to_2i8:
; VEX: # %bb.0:
; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
-; VEX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_2f32_to_2i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_2f32_to_2i8:
@@ -2324,8 +2323,7 @@
; AVX512DQ-LABEL: fptosi_2f32_to_2i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f32_to_2i8:
@@ -2341,13 +2339,13 @@
; SSE-LABEL: fptosi_2f32_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm0, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f32_to_2i16:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: retq
%cvt = fptosi <2 x float> %a to <2 x i16>
ret <2 x i16> %cvt
@@ -2357,6 +2355,7 @@
; SSE-LABEL: fptoui_2f32_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
@@ -2364,15 +2363,13 @@
; VEX-LABEL: fptoui_2f32_to_2i8:
; VEX: # %bb.0:
; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
-; VEX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f32_to_2i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f32_to_2i8:
@@ -2384,8 +2381,7 @@
; AVX512DQ-LABEL: fptoui_2f32_to_2i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i8:
@@ -2407,7 +2403,7 @@
; AVX-LABEL: fptoui_2f32_to_2i16:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: retq
%cvt = fptoui <2 x float> %a to <2 x i16>
ret <2 x i16> %cvt
@@ -2417,22 +2413,21 @@
; SSE-LABEL: fptosi_2f64_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm0, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f64_to_2i8:
; VEX: # %bb.0:
; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; VEX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_2f64_to_2i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_2f64_to_2i8:
@@ -2444,8 +2439,7 @@
; AVX512DQ-LABEL: fptosi_2f64_to_2i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f64_to_2i8:
@@ -2461,13 +2455,13 @@
; SSE-LABEL: fptosi_2f64_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm0, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i16:
; AVX: # %bb.0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i16>
ret <2 x i16> %cvt
@@ -2477,6 +2471,7 @@
; SSE-LABEL: fptoui_2f64_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
@@ -2484,15 +2479,13 @@
; VEX-LABEL: fptoui_2f64_to_2i8:
; VEX: # %bb.0:
; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; VEX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_2i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_2i8:
@@ -2504,8 +2497,7 @@
; AVX512DQ-LABEL: fptoui_2f64_to_2i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_2i8:
@@ -2527,7 +2519,7 @@
; AVX-LABEL: fptoui_2f64_to_2i16:
; AVX: # %bb.0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: retq
%cvt = fptoui <2 x double> %a to <2 x i16>
ret <2 x i16> %cvt
@@ -2536,20 +2528,27 @@
define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
; SSE-LABEL: fptosi_8f64_to_8i16:
; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
-; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
-; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7]
+; SSE-NEXT: cvttpd2dq %xmm3, %xmm0
+; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_8f64_to_8i16:
; VEX: # %bb.0:
; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
+; VEX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VEX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; VEX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; VEX-NEXT: vzeroupper
; VEX-NEXT: retq
;
@@ -2589,25 +2588,26 @@
define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) {
; SSE-LABEL: fptoui_8f64_to_8i16:
; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
-; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
-; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7]
+; SSE-NEXT: cvttpd2dq %xmm3, %xmm0
+; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_8f64_to_8i16:
; VEX: # %bb.0:
; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
+; VEX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VEX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; VEX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; VEX-NEXT: vzeroupper
; VEX-NEXT: retq
@@ -2649,35 +2649,52 @@
; SSE-LABEL: fptosi_16f32_to_16i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm3, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0]
+; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_16f32_to_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptosi_16f32_to_16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2695,11 +2712,16 @@
; SSE-LABEL: fptoui_16f32_to_16i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm3, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0]
+; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -2707,10 +2729,18 @@
; AVX1: # %bb.0:
; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -2718,11 +2748,15 @@
; AVX2-LABEL: fptoui_16f32_to_16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq