Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -702,17 +702,6 @@
   } else
     Promoted = DAG.getNode(NewOpc, dl, NVT, Node->getOperand(0));
 
-  // Assert that the converted value fits in the original type. If it doesn't
-  // (eg: because the value being converted is too big), then the result of the
-  // original operation was undefined anyway, so the assert is still correct.
-  if (Node->getOpcode() == ISD::FP_TO_UINT ||
-      Node->getOpcode() == ISD::STRICT_FP_TO_UINT)
-    NewOpc = ISD::AssertZext;
-  else
-    NewOpc = ISD::AssertSext;
-
-  Promoted = DAG.getNode(NewOpc, dl, NVT, Promoted,
-                         DAG.getValueType(VT.getScalarType()));
   Promoted = DAG.getNode(ISD::TRUNCATE, dl, VT, Promoted);
   Results.push_back(Promoted);
   if (IsStrict)
Index: llvm/test/CodeGen/X86/avx-cvt-2.ll
===================================================================
--- llvm/test/CodeGen/X86/avx-cvt-2.ll
+++ llvm/test/CodeGen/X86/avx-cvt-2.ll
@@ -12,7 +12,10 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT: vmovdqa %xmm0, (%rdi)
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -26,7 +29,10 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT: vmovdqa %xmm0, (%rdi)
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -40,7 +46,10 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: vmovq %xmm0, (%rdi)
 ; CHECK-NEXT: vzeroupper
@@ -55,7 +64,10 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: vmovq %xmm0, (%rdi)
 ; CHECK-NEXT: vzeroupper
Index: llvm/test/CodeGen/X86/avx512-cvt.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512-cvt.ll
+++ llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -502,14 +502,16 @@
 ; NOVL-LABEL: f64to8uc:
 ; NOVL: # %bb.0:
 ; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT: vpmovdb %zmm0, %xmm0
+; NOVL-NEXT: vpmovdw %zmm0, %ymm0
+; NOVL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT: vzeroupper
 ; NOVL-NEXT: retq
 ;
 ; VL-LABEL: f64to8uc:
 ; VL: # %bb.0:
 ; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT: vpmovdb %ymm0, %xmm0
+; VL-NEXT: vpmovdw %ymm0, %xmm0
+; VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; VL-NEXT: vzeroupper
 ; VL-NEXT: retq
 %res = fptoui <8 x double> %f to <8 x i8>
@@ -645,14 +647,16 @@
 ; NOVL-LABEL: f64to8sc:
 ; NOVL: # %bb.0:
 ; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT: vpmovdb %zmm0, %xmm0
+; NOVL-NEXT: vpmovdw %zmm0, %ymm0
+; NOVL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT: vzeroupper
 ; NOVL-NEXT: retq
 ;
 ; VL-LABEL: f64to8sc:
 ; VL: # %bb.0:
 ; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT: vpmovdb %ymm0, %xmm0
+; VL-NEXT: vpmovdw %ymm0, %xmm0
+; VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; VL-NEXT: vzeroupper
 ; VL-NEXT: retq
 %res = fptosi <8 x double> %f to <8 x i8>
@@ -2356,6 +2360,7 @@
 ; NOVLDQ: # %bb.0:
 ; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
 ; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -2364,6 +2369,7 @@
 ; VLDQ-LABEL: test_4f64tosb:
 ; VLDQ: # %bb.0:
 ; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; VLDQ-NEXT: vpmovd2m %xmm0, %k1
 ; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
 ; VLDQ-NEXT: retq
@@ -2371,6 +2377,7 @@
 ; VLNODQ-LABEL: test_4f64tosb:
 ; VLNODQ: # %bb.0:
 ; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
 ; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
 ; VLNODQ-NEXT: retq
@@ -2379,6 +2386,7 @@
 ; DQNOVL: # %bb.0:
 ; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
 ; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0
 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -2392,6 +2400,7 @@
 ; NOVLDQ-LABEL: test_8f64tosb:
 ; NOVLDQ: # %bb.0:
 ; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
+; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; NOVLDQ-NEXT: retq
@@ -2399,6 +2408,7 @@
 ; VLDQ-LABEL: test_8f64tosb:
 ; VLDQ: # %bb.0:
 ; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; VLDQ-NEXT: vpmovd2m %ymm0, %k1
 ; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; VLDQ-NEXT: retq
@@ -2406,6 +2416,7 @@
 ; VLNODQ-LABEL: test_8f64tosb:
 ; VLNODQ: # %bb.0:
 ; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1
 ; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; VLNODQ-NEXT: retq
@@ -2413,6 +2424,7 @@
 ; DQNOVL-LABEL: test_8f64tosb:
 ; DQNOVL: # %bb.0:
 ; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0
 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT: retq
@@ -2426,6 +2438,7 @@
 ; NOVLDQ: # %bb.0:
 ; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
 ; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -2435,6 +2448,7 @@
 ; VLDQ-LABEL: test_2f32tosb:
 ; VLDQ: # %bb.0:
 ; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; VLDQ-NEXT: vpmovd2m %xmm0, %k1
 ; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
 ; VLDQ-NEXT: retq
@@ -2442,6 +2456,7 @@
 ; VLNODQ-LABEL: test_2f32tosb:
 ; VLNODQ: # %bb.0:
 ; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
 ; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
 ; VLNODQ-NEXT: retq
@@ -2450,6 +2465,7 @@
 ; DQNOVL: # %bb.0:
 ; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
 ; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0
+; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0
 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -2465,6 +2481,7 @@
 ; NOVLDQ: # %bb.0:
 ; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
 ; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -2473,6 +2490,7 @@
 ; VLDQ-LABEL: test_4f32tosb:
 ; VLDQ: # %bb.0:
 ; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; VLDQ-NEXT: vpmovd2m %xmm0, %k1
 ; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
 ; VLDQ-NEXT: retq
@@ -2480,6 +2498,7 @@
 ; VLNODQ-LABEL: test_4f32tosb:
 ; VLNODQ: # %bb.0:
 ; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
 ; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
 ; VLNODQ-NEXT: retq
@@ -2488,6 +2507,7 @@
 ; DQNOVL: # %bb.0:
 ; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
 ; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0
+; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0
 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -2501,6 +2521,7 @@
 ; NOVLDQ-LABEL: test_8f32tosb:
 ; NOVLDQ: # %bb.0:
 ; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
+; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; NOVLDQ-NEXT: retq
@@ -2508,6 +2529,7 @@
 ; VLDQ-LABEL: test_8f32tosb:
 ; VLDQ: # %bb.0:
 ; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
+; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; VLDQ-NEXT: vpmovd2m %ymm0, %k1
 ; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; VLDQ-NEXT: retq
@@ -2515,6 +2537,7 @@
 ; VLNODQ-LABEL: test_8f32tosb:
 ; VLNODQ: # %bb.0:
 ; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0
+; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1
 ; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; VLNODQ-NEXT: retq
@@ -2522,6 +2545,7 @@
 ; DQNOVL-LABEL: test_8f32tosb:
 ; DQNOVL: # %bb.0:
 ; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0
+; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0
 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT: retq
@@ -2534,6 +2558,7 @@
 ; NODQ-LABEL: test_16f32tosb:
 ; NODQ: # %bb.0:
 ; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0
+; NODQ-NEXT: vpslld $31, %zmm0, %zmm0
 ; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 ; NODQ-NEXT: retq
@@ -2541,6 +2566,7 @@
 ; VLDQ-LABEL: test_16f32tosb:
 ; VLDQ: # %bb.0:
 ; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0
+; VLDQ-NEXT: vpslld $31, %zmm0, %zmm0
 ; VLDQ-NEXT: vpmovd2m %zmm0, %k1
 ; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 ; VLDQ-NEXT: retq
@@ -2548,6 +2574,7 @@
 ; DQNOVL-LABEL: test_16f32tosb:
 ; DQNOVL: # %bb.0:
 ; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0
+; DQNOVL-NEXT: vpslld $31, %zmm0, %zmm0
 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
 ; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 ; DQNOVL-NEXT: retq
Index: llvm/test/CodeGen/X86/min-legal-vector-width.ll
===================================================================
--- llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -627,8 +627,10 @@
 ; CHECK-LABEL: test_16f32tosb_256:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1
+; CHECK-NEXT: vpslld $31, %ymm1, %ymm1
 ; CHECK-NEXT: vpmovd2m %ymm1, %k0
 ; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1
+; CHECK-NEXT: vpslld $31, %ymm1, %ymm1
 ; CHECK-NEXT: vpmovd2m %ymm1, %k1
 ; CHECK-NEXT: kunpckbw %k0, %k1, %k1
 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
@@ -643,6 +645,7 @@
 ; CHECK-LABEL: test_16f32tosb_512:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1
+; CHECK-NEXT: vpslld $31, %zmm1, %zmm1
 ; CHECK-NEXT: vpmovd2m %zmm1, %k1
 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT: retq
Index: llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
===================================================================
--- llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
+++ llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
@@ -3229,6 +3229,7 @@
 ; AVX512F-LABEL: strict_vector_fptosi_v4f32_to_v4i1:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -3238,6 +3239,7 @@
 ; AVX512VL-LABEL: strict_vector_fptosi_v4f32_to_v4i1:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -3246,6 +3248,7 @@
 ; AVX512DQ-LABEL: strict_vector_fptosi_v4f32_to_v4i1:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -3255,6 +3258,7 @@
 ; AVX512VLDQ-LABEL: strict_vector_fptosi_v4f32_to_v4i1:
 ; AVX512VLDQ: # %bb.0:
 ; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512VLDQ-NEXT: vpmovd2m %xmm0, %k0
 ; AVX512VLDQ-NEXT: vpmovm2d %k0, %xmm0
 ; AVX512VLDQ-NEXT: ret{{[l|q]}}
Index: llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
===================================================================
--- llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+++ llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
@@ -1280,6 +1280,7 @@
 ; AVX512F-LABEL: strict_vector_fptosi_v4f64_to_v4i1:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1289,6 +1290,7 @@
 ; AVX512VL-LABEL: strict_vector_fptosi_v4f64_to_v4i1:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -1298,6 +1300,7 @@
 ; AVX512DQ-LABEL: strict_vector_fptosi_v4f64_to_v4i1:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1307,6 +1310,7 @@
 ; AVX512DQVL-LABEL: strict_vector_fptosi_v4f64_to_v4i1:
 ; AVX512DQVL: # %bb.0:
 ; AVX512DQVL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vpmovd2m %xmm0, %k0
 ; AVX512DQVL-NEXT: vpmovm2d %k0, %xmm0
 ; AVX512DQVL-NEXT: vzeroupper
@@ -1423,7 +1427,10 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: ret{{[l|q]}}
 ;
@@ -1466,7 +1473,10 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: ret{{[l|q]}}
 ;
@@ -1509,7 +1519,10 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: ret{{[l|q]}}
@@ -1517,28 +1530,32 @@
 ; AVX512F-LABEL: strict_vector_fptosi_v8f32_to_v8i8:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: ret{{[l|q]}}
 ;
 ; AVX512VL-LABEL: strict_vector_fptosi_v8f32_to_v8i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: ret{{[l|q]}}
 ;
 ; AVX512DQ-LABEL: strict_vector_fptosi_v8f32_to_v8i8:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: ret{{[l|q]}}
 ;
 ; AVX512DQVL-LABEL: strict_vector_fptosi_v8f32_to_v8i8:
 ; AVX512DQVL: # %bb.0:
 ; AVX512DQVL-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vzeroupper
 ; AVX512DQVL-NEXT: ret{{[l|q]}}
 %ret = call <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f32(<8 x float> %a,
@@ -1551,7 +1568,10 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: ret{{[l|q]}}
@@ -1559,28 +1579,32 @@
 ; AVX512F-LABEL: strict_vector_fptoui_v8f32_to_v8i8:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: ret{{[l|q]}}
 ;
 ; AVX512VL-LABEL: strict_vector_fptoui_v8f32_to_v8i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: ret{{[l|q]}}
 ;
 ; AVX512DQ-LABEL: strict_vector_fptoui_v8f32_to_v8i8:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: ret{{[l|q]}}
 ;
 ; AVX512DQVL-LABEL: strict_vector_fptoui_v8f32_to_v8i8:
 ; AVX512DQVL: # %bb.0:
 ; AVX512DQVL-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vzeroupper
 ; AVX512DQVL-NEXT: ret{{[l|q]}}
 %ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f32(<8 x float> %a,
@@ -1593,13 +1617,17 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: ret{{[l|q]}}
 ;
 ; AVX512F-LABEL: strict_vector_fptosi_v8f32_to_v8i1:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
@@ -1610,6 +1638,7 @@
 ; AVX512VL-LABEL: strict_vector_fptosi_v8f32_to_v8i1:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -1620,6 +1649,7 @@
 ; AVX512DQ-LABEL: strict_vector_fptosi_v8f32_to_v8i1:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
@@ -1630,6 +1660,7 @@
 ; AVX512DQVL-LABEL: strict_vector_fptosi_v8f32_to_v8i1:
 ; AVX512DQVL: # %bb.0:
 ; AVX512DQVL-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpslld $31, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: vpmovd2m %ymm0, %k0
 ; AVX512DQVL-NEXT: vpmovm2d %k0, %ymm0
 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
@@ -1645,7 +1676,10 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: ret{{[l|q]}}
 ;
Index: llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
===================================================================
--- llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
+++ llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
@@ -678,14 +678,16 @@
 ; AVX512VL-LABEL: strict_vector_fptosi_v8f64_to_v8i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: ret{{[l|q]}}
 ;
 ; AVX512DQ-LABEL: strict_vector_fptosi_v8f64_to_v8i8:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: ret{{[l|q]}}
 %ret = call <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f64(<8 x double> %a,
@@ -697,14 +699,16 @@
 ; AVX512VL-LABEL: strict_vector_fptoui_v8f64_to_v8i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: ret{{[l|q]}}
 ;
 ; AVX512DQ-LABEL: strict_vector_fptoui_v8f64_to_v8i8:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: ret{{[l|q]}}
 %ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f64(<8 x double> %a,
@@ -716,6 +720,7 @@
 ; AVX512VL-LABEL: strict_vector_fptosi_v8f64_to_v8i1:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1
 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -726,6 +731,7 @@
 ; AVX512DQ-LABEL: strict_vector_fptosi_v8f64_to_v8i1:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0
+; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
@@ -834,6 +840,7 @@
 ; AVX512VL-LABEL: strict_vector_fptosi_v16f32_to_v16i1:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vcvttps2dq %zmm0, %zmm0
+; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
@@ -843,6 +850,7 @@
 ; AVX512DQ-LABEL: strict_vector_fptosi_v16f32_to_v16i1:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vcvttps2dq %zmm0, %zmm0
+; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
Index: llvm/test/CodeGen/X86/vec_cast2.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_cast2.ll
+++ llvm/test/CodeGen/X86/vec_cast2.ll
@@ -98,7 +98,10 @@
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retl
@@ -111,7 +114,10 @@
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retl
 %res = fptosi <8 x float> %src to <8 x i16>
@@ -144,7 +150,10 @@
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retl
@@ -157,7 +166,10 @@
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retl
 %res = fptoui <8 x float> %src to <8 x i16>
Index: llvm/test/CodeGen/X86/vec_fp_to_int.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2661,10 +2661,16 @@
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -2672,11 +2678,12 @@
 ; AVX2-LABEL: fptosi_16f32_to_16i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -2707,10 +2714,16 @@
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -2718,11 +2731,12 @@
 ; AVX2-LABEL: fptoui_16f32_to_16i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq