Index: clang/include/clang/Basic/BuiltinsNVPTX.def =================================================================== --- clang/include/clang/Basic/BuiltinsNVPTX.def +++ clang/include/clang/Basic/BuiltinsNVPTX.def @@ -278,6 +278,8 @@ BUILTIN(__nvvm_ex2_approx_ftz_f, "ff", "") BUILTIN(__nvvm_ex2_approx_f, "ff", "") BUILTIN(__nvvm_ex2_approx_d, "dd", "") +TARGET_BUILTIN(__nvvm_ex2_approx_f16, "hh", "", AND(SM_75, PTX70)) +TARGET_BUILTIN(__nvvm_ex2_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70)) BUILTIN(__nvvm_lg2_approx_ftz_f, "ff", "") BUILTIN(__nvvm_lg2_approx_f, "ff", "") Index: clang/test/CodeGen/builtins-nvptx-native-half-type.c =================================================================== --- clang/test/CodeGen/builtins-nvptx-native-half-type.c +++ clang/test/CodeGen/builtins-nvptx-native-half-type.c @@ -1,4 +1,9 @@ // REQUIRES: nvptx-registered-target +// +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ +// RUN: sm_75 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \ +// RUN: -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM75 %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ // RUN: sm_80 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \ @@ -32,6 +37,16 @@ #define __device__ __attribute__((device)) +__device__ void nvvm_ex2_sm75() { +#if __CUDA_ARCH__ >= 750 + // CHECK_PTX70_SM75: call half @llvm.nvvm.ex2.approx.f16 + __nvvm_ex2_approx_f16(0.1f16); + // CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.f16x2 + __nvvm_ex2_approx_f16x2({0.1f16, 0.7f16}); +#endif + // CHECK: ret void +} + // CHECK-LABEL: nvvm_min_max_sm80 __device__ void nvvm_min_max_sm80() { #if __CUDA_ARCH__ >= 800 Index: llvm/include/llvm/IR/IntrinsicsNVVM.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsNVVM.td +++ llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -828,6 +828,10 @@ DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_nvvm_ex2_approx_d : GCCBuiltin<"__nvvm_ex2_approx_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; + def int_nvvm_ex2_approx_f16 : GCCBuiltin<"__nvvm_ex2_approx_f16">, + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>; + def int_nvvm_ex2_approx_f16x2 : GCCBuiltin<"__nvvm_ex2_approx_f16x2">, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>; def int_nvvm_lg2_approx_ftz_f : GCCBuiltin<"__nvvm_lg2_approx_ftz_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td =================================================================== --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -907,6 +907,10 @@ Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>; def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>; +def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16 \t$dst, $src0;", + Float16Regs, Float16Regs, int_nvvm_ex2_approx_f16, [hasPTX70, hasSM75]>; +def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2 \t$dst, $src0;", + Float16x2Regs, Float16x2Regs, int_nvvm_ex2_approx_f16x2, [hasPTX70, hasSM75]>; def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>; Index: llvm/test/CodeGen/NVPTX/f16-ex2.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/NVPTX/f16-ex2.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | FileCheck %s + +declare half @llvm.nvvm.ex2.approx.f16(half) +declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>) + +; CHECK-LABEL: exp2_half +define half @exp2_half(half %0) { + ; CHECK-NOT: call + ; CHECK: ex2.approx.f16 + %res = call half @llvm.nvvm.ex2.approx.f16(half %0); + ret half %res +} + +; CHECK-LABEL: exp2_2xhalf +define <2 x half> @exp2_2xhalf(<2 x half> %0) { + ; CHECK-NOT: call + ; CHECK: ex2.approx.f16x2 + %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0); + ret <2 x half> %res +}