Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -735,8 +735,8 @@
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
-    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
-                                                    : &X86::VR128RegClass);
+    addRegisterClass(MVT::v4f32, Subtarget.hasAVX512() ? &X86::VR128XRegClass
+                                                       : &X86::VR128RegClass);
 
     setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
     setOperationAction(ISD::FABS, MVT::v4f32, Custom);
@@ -750,19 +750,19 @@
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
-    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
-                                                    : &X86::VR128RegClass);
+    addRegisterClass(MVT::v2f64, Subtarget.hasAVX512() ? &X86::VR128XRegClass
+                                                       : &X86::VR128RegClass);
 
     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
     // registers cannot be used even for integer operations.
-    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
-                                                    : &X86::VR128RegClass);
-    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
-                                                    : &X86::VR128RegClass);
-    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
-                                                    : &X86::VR128RegClass);
-    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
-                                                    : &X86::VR128RegClass);
+    addRegisterClass(MVT::v16i8, Subtarget.hasAVX512() ? &X86::VR128XRegClass
+                                                       : &X86::VR128RegClass);
+    addRegisterClass(MVT::v8i16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
+                                                       : &X86::VR128RegClass);
+    addRegisterClass(MVT::v4i32, Subtarget.hasAVX512() ? &X86::VR128XRegClass
+                                                       : &X86::VR128RegClass);
+    addRegisterClass(MVT::v2i64, Subtarget.hasAVX512() ? &X86::VR128XRegClass
+                                                       : &X86::VR128RegClass);
 
     setOperationAction(ISD::MUL, MVT::v16i8, Custom);
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
@@ -971,18 +971,18 @@
   if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
     bool HasInt256 = Subtarget.hasInt256();
 
-    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                    : &X86::VR256RegClass);
-    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                     : &X86::VR256RegClass);
-    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                    : &X86::VR256RegClass);
-    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                    : &X86::VR256RegClass);
-    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                    : &X86::VR256RegClass);
-    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                    : &X86::VR256RegClass);
+    addRegisterClass(MVT::v32i8, Subtarget.hasAVX512() ? &X86::VR256XRegClass
+                                                       : &X86::VR256RegClass);
+    addRegisterClass(MVT::v16i16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
+                                                        : &X86::VR256RegClass);
+    addRegisterClass(MVT::v8i32, Subtarget.hasAVX512() ? &X86::VR256XRegClass
+                                                       : &X86::VR256RegClass);
+    addRegisterClass(MVT::v8f32, Subtarget.hasAVX512() ? &X86::VR256XRegClass
+                                                       : &X86::VR256RegClass);
+    addRegisterClass(MVT::v4i64, Subtarget.hasAVX512() ? &X86::VR256XRegClass
+                                                       : &X86::VR256RegClass);
+    addRegisterClass(MVT::v4f64, Subtarget.hasAVX512() ? &X86::VR256XRegClass
+                                                       : &X86::VR256RegClass);
 
     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
       setOperationAction(ISD::FFLOOR, VT, Legal);
Index: lib/Target/X86/X86RegisterInfo.cpp
===================================================================
--- lib/Target/X86/X86RegisterInfo.cpp
+++ lib/Target/X86/X86RegisterInfo.cpp
@@ -136,23 +136,14 @@
   switch (Super->getID()) {
   case X86::FR32RegClassID:
   case X86::FR64RegClassID:
-    // If AVX-512 isn't supported we should only inflate to these classes.
-    if (!Subtarget.hasAVX512() && Super->getSize() == RC->getSize())
-      return Super;
-    break;
   case X86::VR128RegClassID:
   case X86::VR256RegClassID:
-    // If VLX isn't supported we should only inflate to these classes.
-    if (!Subtarget.hasVLX() && Super->getSize() == RC->getSize())
+    // If AVX-512 isn't supported we should only inflate to these classes.
+    if (!Subtarget.hasAVX512() && Super->getSize() == RC->getSize())
       return Super;
     break;
   case X86::FR32XRegClassID:
   case X86::FR64XRegClassID:
-    // If VLX isn't support we shouldn't inflate to these classes.
-    if (!Subtarget.hasVLX())
-      break;
-    // The VLX check above passed, AVX512 check below will pass.
-    LLVM_FALLTHROUGH;
   case X86::VR128XRegClassID:
   case X86::VR256XRegClassID:
     // If AVX-512 isn't support we shouldn't inflate to these classes.
Index: test/CodeGen/X86/avx512-extended-xmm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx512-extended-xmm.ll
@@ -0,0 +1,252 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+
+; This test uses vpmovsxbd and vpmovdb to read and write xmm registers. These should be able to use XMM16-31 even without VLX.
+
+define <16 x i8> @foo(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 x i8> %a3, <16 x i8> %a4, <16 x i8> %a5) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa %xmm4, %xmm14
+; CHECK-NEXT: vmovdqa %xmm3, %xmm10
+; CHECK-NEXT: vmovdqa %xmm2, %xmm13
+; CHECK-NEXT: vmovdqa %xmm0, %xmm8
+; CHECK-NEXT: vpaddb %xmm1, %xmm8, %xmm6
+; CHECK-NEXT: vmovdqa %xmm1, %xmm11
+; CHECK-NEXT: vpaddb %xmm10, %xmm13, %xmm2
+; CHECK-NEXT: vpaddb %xmm5, %xmm14, %xmm4
+; CHECK-NEXT: vpmovsxbw %xmm2, %ymm3
+; CHECK-NEXT: vpmovsxbw %xmm6, %ymm0
+; CHECK-NEXT: vmovdqa64 %zmm6, %zmm21
+; CHECK-NEXT: vpmullw %ymm3, %ymm0, %ymm7
+; CHECK-NEXT: vmovdqa64 %zmm3, %zmm20
+; CHECK-NEXT: vpmovsxwd %ymm7, %zmm7
+; CHECK-NEXT: vpmovdb %zmm7, %xmm12
+; CHECK-NEXT: vmovdqa %xmm4, %xmm6
+; CHECK-NEXT: vpaddb %xmm12, %xmm6, %xmm1
+; CHECK-NEXT: vpmovsxbw %xmm1, %ymm7
+; CHECK-NEXT: vmovdqa %xmm1, %xmm9
+; CHECK-NEXT: vpmullw %ymm0, %ymm7, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm15
+; CHECK-NEXT: vpaddb %xmm2, %xmm15, %xmm4
+; CHECK-NEXT: vpmovsxbw %xmm4, %ymm2
+; CHECK-NEXT: vpmovsxbw %xmm6, %ymm7
+; CHECK-NEXT: vpmullw %ymm7, %ymm2, %ymm2
+; CHECK-NEXT: vpmovsxwd %ymm2, %zmm2
+; CHECK-NEXT: vpmovdb %zmm2, %xmm3
+; CHECK-NEXT: vpaddb %xmm8, %xmm3, %xmm1
+; CHECK-NEXT: vpmovsxbw %xmm1, %ymm2
+; CHECK-NEXT: vpmovsxbw %xmm11, %ymm7
+; CHECK-NEXT: vmovdqa64 %zmm11, %zmm28
+; CHECK-NEXT: vpmullw %ymm7, %ymm2, %ymm2
+; CHECK-NEXT: vpmovsxwd %ymm2, %zmm2
+; CHECK-NEXT: vpmovsxbw %xmm10, %ymm11
+; CHECK-NEXT: vpmovdb %zmm2, %xmm2
+; CHECK-NEXT: vpaddb %xmm13, %xmm2, %xmm0
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm25
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm2
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm17
+; CHECK-NEXT: vpmullw %ymm11, %ymm2, %ymm2
+; CHECK-NEXT: vpmovsxwd %ymm2, %zmm16
+; CHECK-NEXT: vpmovsxbw %xmm5, %ymm10
+; CHECK-NEXT: vpmovdb %zmm16, %xmm5
+; CHECK-NEXT: vpaddb %xmm14, %xmm5, %xmm0
+; CHECK-NEXT: vmovdqa64 %zmm5, %zmm19
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm5
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm18
+; CHECK-NEXT: vpmullw %ymm10, %ymm5, %ymm5
+; CHECK-NEXT: vpmovsxwd %ymm5, %zmm5
+; CHECK-NEXT: vpmovdb %zmm5, %xmm5
+; CHECK-NEXT: vpaddb %xmm1, %xmm5, %xmm5
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm22
+; CHECK-NEXT: vpmovsxbw %xmm5, %ymm5
+; CHECK-NEXT: vpmovsxbw %xmm3, %ymm1
+; CHECK-NEXT: vpmullw %ymm1, %ymm5, %ymm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm23
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa64 %zmm4, %zmm24
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vpmovsxbw %xmm15, %ymm1
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm26
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm9, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa64 %zmm9, %zmm27
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vpmovsxbw %xmm12, %ymm1
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm29
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm6, %xmm0, %xmm1
+; CHECK-NEXT: vpmovsxbw %xmm1, %ymm5
+; CHECK-NEXT: vmovdqa64 %zmm20, %zmm12
+; CHECK-NEXT: vpmullw %ymm12, %ymm5, %ymm5
+; CHECK-NEXT: vpmovsxwd %ymm5, %zmm5
+; CHECK-NEXT: vpmovdb %zmm5, %xmm5
+; CHECK-NEXT: vmovdqa64 %zmm21, %zmm2
+; CHECK-NEXT: vpaddb %xmm2, %xmm5, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm9
+; CHECK-NEXT: vpmullw %ymm10, %ymm9, %ymm15
+; CHECK-NEXT: vpmovsxwd %ymm15, %zmm15
+; CHECK-NEXT: vpmovdb %zmm15, %xmm5
+; CHECK-NEXT: vpaddb %xmm14, %xmm5, %xmm5
+; CHECK-NEXT: vpmovsxbw %xmm5, %ymm5
+; CHECK-NEXT: vpmullw %ymm11, %ymm5, %ymm5
+; CHECK-NEXT: vpmovsxwd %ymm5, %zmm5
+; CHECK-NEXT: vpmovdb %zmm5, %xmm5
+; CHECK-NEXT: vpaddb %xmm13, %xmm5, %xmm5
+; CHECK-NEXT: vpmovsxbw %xmm5, %ymm5
+; CHECK-NEXT: vpmullw %ymm7, %ymm5, %ymm5
+; CHECK-NEXT: vpmovsxwd %ymm5, %zmm5
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm5, %xmm5
+; CHECK-NEXT: vpaddb %xmm8, %xmm5, %xmm5
+; CHECK-NEXT: vpmovsxbd %xmm5, %zmm7
+; CHECK-NEXT: vmovdqa64 %zmm25, %zmm3
+; CHECK-NEXT: vpmovsxbd %xmm3, %zmm15
+; CHECK-NEXT: vpmovsxbd %xmm1, %zmm16
+; CHECK-NEXT: vpaddd %zmm0, %zmm16, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm15, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm7, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vpmovsxbw %xmm8, %ymm4
+; CHECK-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm28, %zmm4
+; CHECK-NEXT: vpaddb %xmm13, %xmm4, %xmm4
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vpmullw %ymm11, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm14, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vpmullw %ymm10, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vpmullw %ymm12, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm5, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vpmovsxbw %xmm3, %ymm2
+; CHECK-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vpmullw %ymm9, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vpaddb %xmm6, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vmovdqa64 %zmm29, %zmm1
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vmovdqa64 %zmm27, %zmm1
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vmovdqa64 %zmm26, %zmm1
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vmovdqa64 %zmm24, %zmm1
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vmovdqa64 %zmm23, %zmm1
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vmovdqa64 %zmm22, %zmm1
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vmovdqa64 %zmm17, %zmm1
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vmovdqa64 %zmm19, %zmm1
+; CHECK-NEXT: vpmovsxbw %xmm1, %ymm1
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpmovdb %zmm0, %xmm0
+; CHECK-NEXT: vmovdqa64 %zmm18, %zmm1
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+  %1 = add <16 x i8> %a0, %a1
+  %2 = add <16 x i8> %a2, %a3
+  %3 = add <16 x i8> %a4, %a5
+  %4 = mul <16 x i8> %1, %2
+  %5 = add <16 x i8> %3, %4
+  %6 = mul <16 x i8> %5, %1
+  %7 = add <16 x i8> %6, %2
+  %8 = mul <16 x i8> %7, %3
+  %9 = add <16 x i8> %8, %a0
+  %10 = mul <16 x i8> %9, %a1
+  %11 = add <16 x i8> %10, %a2
+  %12 = mul <16 x i8> %11, %a3
+  %13 = add <16 x i8> %12, %a4
+  %14 = mul <16 x i8> %13, %a5
+  %15 = add <16 x i8> %14, %9
+  %16 = mul <16 x i8> %15, %8
+  %17 = add <16 x i8> %16, %7
+  %18 = mul <16 x i8> %17, %6
+  %19 = add <16 x i8> %18, %5
+  %20 = mul <16 x i8> %19, %4
+  %21 = add <16 x i8> %20, %3
+  %22 = mul <16 x i8> %21, %2
+  %23 = add <16 x i8> %22, %1
+  %24 = mul <16 x i8> %23, %a5
+  %25 = add <16 x i8> %24, %a4
+  %26 = mul <16 x i8> %25, %a3
+  %27 = add <16 x i8> %26, %a2
+  %28 = mul <16 x i8> %27, %a1
+  %29 = add <16 x i8> %28, %a0
+  %30 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %29, <16 x i32> undef, i16 -1)
+  %31 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %10, <16 x i32> undef, i16 -1)
+  %32 = add <16 x i32> %30, %31
+  %33 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %21, <16 x i32> undef, i16 -1)
+  %34 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %23, <16 x i32> undef, i16 -1)
+  %35 = add <16 x i32> %33, %34
+  %36 = add <16 x i32> %32, %35
+  %37 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %36, <16 x i8> undef, i16 -1)
+  %38 = mul <16 x i8> %37, %a0
+  %39 = add <16 x i8> %38, %a1
+  %40 = add <16 x i8> %39, %a2
+  %41 = mul <16 x i8> %40, %a3
+  %42 = add <16 x i8> %41, %a4
+  %43 = mul <16 x i8> %42, %a5
+  %44 = add <16 x i8> %43, %1
+  %45 = mul <16 x i8> %44, %2
+  %46 = add <16 x i8> %45, %29
+  %47 = mul <16 x i8> %46, %10
+  %48 = add <16 x i8> %47, %21
+  %49 = mul <16 x i8> %48, %23
+  %50 = add <16 x i8> %49, %3
+  %51 = mul <16 x i8> %50, %4
+  %52 = add <16 x i8> %51, %5
+  %53 = mul <16 x i8> %52, %6
+  %54 = add <16 x i8> %53, %7
+  %55 = mul <16 x i8> %54, %8
+  %56 = add <16 x i8> %55, %9
+  %57 = mul <16 x i8> %56, %10
+  %58 = add <16 x i8> %57, %11
+  %59 = mul <16 x i8> %58, %12
+  %60 = add <16 x i8> %59, %13
+
+  ret <16 x i8> %60
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16)
+
Index: test/CodeGen/X86/vector-half-conversions.ll
===================================================================
--- test/CodeGen/X86/vector-half-conversions.ll
+++ test/CodeGen/X86/vector-half-conversions.ll
@@ -3350,69 +3350,69 @@
 ;
 ; AVX512F-LABEL: cvt_16f32_to_16i16:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm14
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm5, %ymm5
+; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm7
+; AVX512F-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm8[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm6, %ymm6
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm9 = xmm8[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm9, %ymm9
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm8[1,1,3,3]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm10, %ymm10
+; AVX512F-NEXT: vcvtps2ph $4, %zmm8, %ymm8
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm11 = xmm7[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm11, %ymm11
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm12 = xmm7[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm12, %ymm12
+; AVX512F-NEXT: vcvtps2ph $4, %zmm7, %ymm13
+; AVX512F-NEXT: vmovd %xmm13, %eax
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm7, %ymm7
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vmovd %xmm7, %eax
+; AVX512F-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vmovd %xmm12, %eax
+; AVX512F-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vmovd %xmm11, %eax
+; AVX512F-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vmovd %xmm8, %eax
+; AVX512F-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vmovd %xmm10, %eax
+; AVX512F-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vmovd %xmm9, %eax
+; AVX512F-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vmovd %xmm6, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm6
+; AVX512F-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vmovd %xmm6, %eax
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: vmovd %eax, %xmm6
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vpinsrw $1, %eax, %xmm6, %xmm0
+; AVX512F-NEXT: vmovd %xmm5, %eax
+; AVX512F-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm4, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm4
+; AVX512F-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm4, %eax
+; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm3, %eax
+; AVX512F-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm14, %eax
 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm1
-; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0
-; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX512F-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX512F-NEXT: retq
 ;
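
Not part of the patch, a minimal sketch for illustration: the .ll below strips avx512-extended-xmm.ll down to a single round trip through the two intrinsics the new test relies on. The RUN line, triple, CPU and intrinsic signatures are copied from the test above; the function name @roundtrip is hypothetical and no CHECK output is claimed. It only illustrates the mechanism the test comment describes: vpmovsxbd and vpmovdb are EVEX-encoded, so once v16i8 values come from VR128X (AVX-512 without VLX), their XMM operands are allowed to land in XMM16-31 under register pressure.

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl
; Hypothetical reduction of avx512-extended-xmm.ll (illustrative sketch, not autogenerated).

declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16)
declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)

; Widen v16i8 to v16i32 with VPMOVSXBD, then narrow back with VPMOVDB. Both
; instructions exist only in EVEX form, so the v16i8 value they read/write may
; be assigned to XMM16-31 now that it is allocated from VR128X when AVX-512 is
; available, even without VLX.
define <16 x i8> @roundtrip(<16 x i8> %a) nounwind {
  %w = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %a, <16 x i32> undef, i16 -1)
  %n = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %w, <16 x i8> undef, i16 -1)
  ret <16 x i8> %n
}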