Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1653,6 +1653,14 @@
     addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
     addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
 
+    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
+    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
+    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
+    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+    addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
+    addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
+
     setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
@@ -1716,14 +1724,6 @@
   if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
     bool HasBWI = Subtarget.hasBWI();
 
-    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
-    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
-    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
-    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
-    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
-    addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
-    addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
-
     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
Index: llvm/test/CodeGen/X86/avx512fp16-cvt.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512fp16-cvt.ll
+++ llvm/test/CodeGen/X86/avx512fp16-cvt.ll
@@ -1029,25 +1029,165 @@
 }
 
 define <8 x half> @s64tof16(<8 x i64> %a) #0 {
-; CHECK-LABEL: s64tof16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vcvtqq2ph %ymm1, %xmm1
-; CHECK-NEXT: vcvtqq2ph %ymm0, %xmm0
-; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: ret{{[l|q]}}
+; X64-LABEL: s64tof16:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: .cfi_offset %rbp, -16
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: .cfi_def_cfa_register %rbp
+; X64-NEXT: andq $-64, %rsp
+; X64-NEXT: subq $128, %rsp
+; X64-NEXT: vmovaps %zmm0, (%rsp)
+; X64-NEXT: vcvtsi2shq {{[0-9]+}}(%rsp), %xmm1, %xmm0
+; X64-NEXT: vcvtsi2shq {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: vcvtsi2shq {{[0-9]+}}(%rsp), %xmm2, %xmm1
+; X64-NEXT: vcvtsi2shq {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: vcvtsi2shq {{[0-9]+}}(%rsp), %xmm3, %xmm1
+; X64-NEXT: vcvtsi2shq {{[0-9]+}}(%rsp), %xmm3, %xmm2
+; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT: vcvtsi2shq {{[0-9]+}}(%rsp), %xmm3, %xmm2
+; X64-NEXT: vcvtsi2shq (%rsp), %xmm3, %xmm3
+; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NEXT: movq %rbp, %rsp
+; X64-NEXT: popq %rbp
+; X64-NEXT: .cfi_def_cfa %rsp, 8
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+;
+; X86-LABEL: s64tof16:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-64, %esp
+; X86-NEXT: subl $128, %esp
+; X86-NEXT: vmovaps %zmm0, (%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: vcvtqq2ph %xmm0, %xmm0
+; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X86-NEXT: vcvtqq2ph %xmm1, %xmm1
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X86-NEXT: vcvtqq2ph %xmm1, %xmm1
+; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; X86-NEXT: vcvtqq2ph %xmm2, %xmm2
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X86-NEXT: vcvtqq2ph %xmm1, %xmm1
+; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; X86-NEXT: vcvtqq2ph %xmm2, %xmm2
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; X86-NEXT: vcvtqq2ph %xmm2, %xmm2
+; X86-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm3, %xmm3
+; X86-NEXT: vcvtqq2ph %xmm3, %xmm3
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
   %1 = sitofp <8 x i64> %a to <8 x half>
   ret <8 x half> %1
 }
 
 define <8 x half> @u64tof16(<8 x i64> %a) #0 {
-; CHECK-LABEL: u64tof16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vcvtuqq2ph %ymm1, %xmm1
-; CHECK-NEXT: vcvtuqq2ph %ymm0, %xmm0
-; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: ret{{[l|q]}}
+; X64-LABEL: u64tof16:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: .cfi_offset %rbp, -16
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: .cfi_def_cfa_register %rbp
+; X64-NEXT: andq $-64, %rsp
+; X64-NEXT: subq $128, %rsp
+; X64-NEXT: vmovaps %zmm0, (%rsp)
+; X64-NEXT: vcvtusi2shq {{[0-9]+}}(%rsp), %xmm1, %xmm0
+; X64-NEXT: vcvtusi2shq {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: vcvtusi2shq {{[0-9]+}}(%rsp), %xmm2, %xmm1
+; X64-NEXT: vcvtusi2shq {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: vcvtusi2shq {{[0-9]+}}(%rsp), %xmm3, %xmm1
+; X64-NEXT: vcvtusi2shq {{[0-9]+}}(%rsp), %xmm3, %xmm2
+; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT: vcvtusi2shq {{[0-9]+}}(%rsp), %xmm3, %xmm2
+; X64-NEXT: vcvtusi2shq (%rsp), %xmm3, %xmm3
+; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NEXT: movq %rbp, %rsp
+; X64-NEXT: popq %rbp
+; X64-NEXT: .cfi_def_cfa %rsp, 8
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+;
+; X86-LABEL: u64tof16:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-64, %esp
+; X86-NEXT: subl $128, %esp
+; X86-NEXT: vmovaps %zmm0, (%esp)
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT: vcvtuqq2ph %xmm0, %xmm0
+; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X86-NEXT: vcvtuqq2ph %xmm1, %xmm1
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X86-NEXT: vcvtuqq2ph %xmm1, %xmm1
+; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; X86-NEXT: vcvtuqq2ph %xmm2, %xmm2
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X86-NEXT: vcvtuqq2ph %xmm1, %xmm1
+; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; X86-NEXT: vcvtuqq2ph %xmm2, %xmm2
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; X86-NEXT: vcvtuqq2ph %xmm2, %xmm2
+; X86-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm3, %xmm3
+; X86-NEXT: vcvtuqq2ph %xmm3, %xmm3
+; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
   %1 = uitofp <8 x i64> %a to <8 x half>
   ret <8 x half> %1
 }
Index: llvm/test/CodeGen/X86/pr59800.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/pr59800.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+;RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.20.0"
+
+; Function Attrs: nounwind uwtable
+define void @foo() #0 {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: .seh_stackalloc 24
+; CHECK-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; CHECK-NEXT: .seh_savexmm %xmm6, 0
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: #APP
+; CHECK-NEXT: vpxord %zmm6, %zmm6, %zmm6
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
+; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+; CHECK-NEXT: .seh_endproc
+  tail call void asm sideeffect "vpxord %zmm6, %zmm6, %zmm6\0A", "~{zmm6},~{dirflag},~{fpsr},~{flags}"() #1
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" }
+attributes #1 = { nounwind }
+