Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -16974,6 +16974,7 @@
   case X86ISD::VTRUNC:
   case X86ISD::VTRUNCS:
   case X86ISD::VTRUNCUS:
+  case ISD::FP_TO_FP16:
     // We can't use ISD::VSELECT here because it is not always "Legal"
     // for the destination type. For example vpmovqb require only AVX512
     // and vselect that can operate on byte element type require BWI
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -248,12 +248,12 @@
                            string AttSrcAsm, string IntelSrcAsm,
                            dag RHS,
                            InstrItinClass itin = NoItinerary,
-                           bit IsCommutable = 0> :
+                           bit IsCommutable = 0, SDNode Select = vselect> :
    AVX512_maskable_common<O, F, _, Outs, Ins,
                           !con((ins _.RC:$src0), Ins),
                           !con((ins _.KRCWM:$mask), Ins),
                           OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
-                          (vselect _.KRCWM:$mask, RHS, _.RC:$src0),
+                          (Select _.KRCWM:$mask, RHS, _.RC:$src0),
                           "$src0 = $dst", itin, IsCommutable>;
 
 // This multiclass generates the unconditional/non-masking, the masking and
@@ -5807,11 +5807,12 @@
 multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                            X86MemOperand x86memop> {
   defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
-                   (ins _src.RC:$src1, i32u8imm:$src2),
-                   "vcvtps2ph", "$src2, $src1", "$src1, $src2",
+                            (ins _src.RC:$src1, i32u8imm:$src2),
+                            "vcvtps2ph", "$src2, $src1", "$src1, $src2",
                    (X86cvtps2ph (_src.VT _src.RC:$src1),
                                 (i32 imm:$src2),
-                                (i32 FROUND_CURRENT))>, AVX512AIi8Base;
+                                (i32 FROUND_CURRENT)),
+                            NoItinerary, 0, X86select>, AVX512AIi8Base;
   let hasSideEffects = 0, mayStore = 1 in {
     def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
                (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
@@ -5827,11 +5828,12 @@
 }
 multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
   defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
-                   (ins _src.RC:$src1, i32u8imm:$src2),
-                   "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2",
+                            (ins _src.RC:$src1, i32u8imm:$src2),
+                            "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2",
                    (X86cvtps2ph (_src.VT _src.RC:$src1),
                                 (i32 imm:$src2),
-                                (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base;
+                                (i32 FROUND_NO_EXC)),
+                            NoItinerary, 0, X86select>, EVEX_B, AVX512AIi8Base;
 }
 let Predicates = [HasAVX512] in {
   defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -604,13 +604,20 @@
 declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
 
-
-define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0) {
+define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, <16 x i16> * %dst) {
 ; CHECK-LABEL: test_x86_vcvtps2ph_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm0
-; CHECK-NEXT:    retq
-  %res = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vcvtps2ph $2, %zmm0, (%rsi)
+; CHECK-NEXT:    vpaddw %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    retq
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
+  %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
+  %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> %src, i16 %mask)
+  store <16 x i16> %res1, <16 x i16> * %dst
+  %res = add <16 x i16> %res2, %res3
   ret <16 x i16> %res
 }
 
Index: test/CodeGen/X86/avx512vl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics.ll
+++ test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -6929,24 +6929,41 @@
 declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>, i8) nounwind readonly
 
-define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) {
+define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %src) {
 ; CHECK-LABEL: test_x86_vcvtps2ph_128:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vcvtps2ph $2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
+; CHECK-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02]
 ; CHECK-NEXT:    vcvtps2ph $2, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1d,0xc0,0x02]
+; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
+  %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+  %res0 = add <8 x i16> %res1, %res2
+  %res = add <8 x i16> %res3, %res0
   ret <8 x i16> %res
 }
 
-
 declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float>, i32, <8 x i16>, i8) nounwind readonly
 
-define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) {
+define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %src) {
 ; CHECK-LABEL: test_x86_vcvtps2ph_256:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vcvtps2ph $2, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
+; CHECK-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02]
 ; CHECK-NEXT:    vcvtps2ph $2, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1d,0xc0,0x02]
+; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
+  %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+  %res0 = add <8 x i16> %res1, %res2
+  %res = add <8 x i16> %res3, %res0
   ret <8 x i16> %res
 }