diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -564,9 +564,13 @@
     setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
   }
 
-  // There's no neg.f16 instruction. Expand to (0-x).
-  setOperationAction(ISD::FNEG, MVT::f16, Expand);
-  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
+  // f16/f16x2 neg was introduced in PTX 60, SM_53.
+  const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
+                                        STI.getPTXVersion() >= 60 &&
+                                        STI.allowFP16Math();
+  for (const auto &VT : {MVT::f16, MVT::v2f16})
+    setOperationAction(ISD::FNEG, VT,
+                       IsFP16FP16x2NegAvailable ? Legal : Expand);
 
   // (would be) Library functions.
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -921,6 +921,19 @@
 defm FNEG : F2<"neg", fneg>;
 defm FSQRT : F2<"sqrt.rn", fsqrt>;
 
+//
+// F16 NEG (neg.f16 / neg.f16x2; requires useFP16Math, hasPTX60 and hasSM53)
+//
+class FNEG_F16_F16X2<string OpcStr, RegisterClass RC, Predicate Pred> :
+      NVPTXInst<(outs RC:$dst), (ins RC:$src),
+                !strconcat(OpcStr, " \t$dst, $src;"),
+                [(set RC:$dst, (fneg RC:$src))]>,
+                Requires<[useFP16Math, hasPTX60, hasSM53, Pred]>;
+def FNEG16_ftz : FNEG_F16_F16X2<"neg.ftz.f16", Float16Regs, doF32FTZ>;
+def FNEG16 : FNEG_F16_F16X2<"neg.f16", Float16Regs, True>;
+def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", Float16x2Regs, doF32FTZ>;
+def FNEG16x2 : FNEG_F16_F16X2<"neg.f16x2", Float16x2Regs, True>;
+
 //
 // F64 division
 //
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -1,27 +1,29 @@
 ; ## Full FP16 support enabled by default.
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
 ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: -mattr=+ptx60 \
 ; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-F16-NOFTZ %s
 ; RUN: %if ptxas %{ \
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
 ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: -mattr=+ptx60 \
 ; RUN: | %ptxas-verify -arch=sm_53 \
 ; RUN: %}
 ; ## Full FP16 with FTZ
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
 ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
-; RUN: -denormal-fp-math-f32=preserve-sign \
+; RUN: -denormal-fp-math-f32=preserve-sign -mattr=+ptx60 \
 ; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16-FTZ %s
 ; RUN: %if ptxas %{ \
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
 ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
-; RUN: -denormal-fp-math-f32=preserve-sign \
+; RUN: -denormal-fp-math-f32=preserve-sign -mattr=+ptx60 \
 ; RUN: | %ptxas-verify -arch=sm_53 \
 ; RUN: %}
 ; ## FP16 support explicitly disabled.
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
 ; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
-; RUN: -verify-machineinstrs \
+; RUN: -verify-machineinstrs -mattr=+ptx60 \
 ; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOFTZ,CHECK-NOF16 %s
 ; RUN: %if ptxas %{ \
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
@@ -1168,5 +1170,24 @@
   ret half %r
 }
 
+; CHECK-LABEL: test_neg_f16(
+; CHECK-F16-NOFTZ: neg.f16
+; CHECK-F16-FTZ: neg.ftz.f16
+; CHECK-NOF16: xor.b16 %rs{{.*}}, %rs{{.*}}, -32768
+define half @test_neg_f16(half noundef %arg) #0 {
+  %res = fneg half %arg
+  ret half %res
+}
+
+; CHECK-LABEL: test_neg_f16x2(
+; CHECK-F16-NOFTZ: neg.f16x2
+; CHECK-F16-FTZ: neg.ftz.f16x2
+; CHECK-NOF16: xor.b16 %rs{{.*}}, %rs{{.*}}, -32768
+; CHECK-NOF16: xor.b16 %rs{{.*}}, %rs{{.*}}, -32768
+define <2 x half> @test_neg_f16x2(<2 x half> noundef %arg) #0 {
+  %res = fneg <2 x half> %arg
+  ret <2 x half> %res
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { "unsafe-fp-math" = "true" }