diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -107,13 +107,89 @@ // Min Max -BUILTIN(__nvvm_fmax_ftz_f, "fff", "") -BUILTIN(__nvvm_fmax_f, "fff", "") -BUILTIN(__nvvm_fmin_ftz_f, "fff", "") -BUILTIN(__nvvm_fmin_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmin_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16, "hhh", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_bf16, "UsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16, "UsUsUs", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) 
+TARGET_BUILTIN(__nvvm_fmin_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16x2, "ZUiZUiZUi", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "", + AND(SM_86, PTX72)) +BUILTIN(__nvvm_fmin_f, "fff", "") +BUILTIN(__nvvm_fmin_ftz_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmin_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +BUILTIN(__nvvm_fmin_d, "ddd", "") +TARGET_BUILTIN(__nvvm_fmax_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16, "hhh", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) 
+TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_bf16, "UsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16, "UsUsUs", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16x2, "ZUiZUiZUi", "", + AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "", + AND(SM_86, PTX72)) +BUILTIN(__nvvm_fmax_f, "fff", "") +BUILTIN(__nvvm_fmax_ftz_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmax_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f, "fff", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) BUILTIN(__nvvm_fmax_d, "ddd", "") -BUILTIN(__nvvm_fmin_d, "ddd", "") // Multiplication @@ -827,6 +903,13 @@ TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70)) + +// bf16, bf16x2 abs, neg +TARGET_BUILTIN(__nvvm_abs_bf16, "UsUs", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_abs_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_neg_bf16, "UsUs", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_neg_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70)) + #undef BUILTIN #undef TARGET_BUILTIN #pragma pop_macro("AND") diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c new file mode 100644 --- /dev/null +++ 
b/clang/test/CodeGen/builtins-nvptx-native-half-type.c @@ -0,0 +1,103 @@ +// REQUIRES: nvptx-registered-target + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ +// RUN: sm_80 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \ +// RUN: -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \ +// RUN: -target-cpu sm_80 -target-feature +ptx70 -fcuda-is-device \ +// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ +// RUN: sm_86 -target-feature +ptx72 -fcuda-is-device -fnative-half-type -S \ +// RUN: -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \ +// RUN: -target-cpu sm_86 -target-feature +ptx72 -fcuda-is-device \ +// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s + +#define __device__ __attribute__((device)) + +// CHECK-LABEL: nvvm_min_max_sm80 +__device__ void nvvm_min_max_sm80() { +#if __CUDA_ARCH__ >= 800 + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.f16 + __nvvm_fmin_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.f16 + __nvvm_fmin_ftz_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.nan.f16 + __nvvm_fmin_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.nan.f16 + __nvvm_fmin_ftz_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.f16x2 + __nvvm_fmin_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.f16x2 + __nvvm_fmin_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // 
CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.nan.f16x2 + __nvvm_fmin_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2 + __nvvm_fmin_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.f16 + __nvvm_fmax_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.f16 + __nvvm_fmax_ftz_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.nan.f16 + __nvvm_fmax_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.nan.f16 + __nvvm_fmax_ftz_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.f16x2 + __nvvm_fmax_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.f16x2 + __nvvm_fmax_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.nan.f16x2 + __nvvm_fmax_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2 + __nvvm_fmax_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_min_max_sm86 +__device__ void nvvm_min_max_sm86() { +#if __CUDA_ARCH__ >= 860 + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.xorsign.abs.f16 + __nvvm_fmin_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16 + __nvvm_fmin_ftz_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.nan.xorsign.abs.f16 + __nvvm_fmin_nan_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16 + __nvvm_fmin_ftz_nan_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2 + __nvvm_fmin_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2 + 
__nvvm_fmin_ftz_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2 + __nvvm_fmin_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2 + __nvvm_fmin_ftz_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.xorsign.abs.f16 + __nvvm_fmax_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.ftz.xorsign.abs.f16 + __nvvm_fmax_ftz_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.nan.xorsign.abs.f16 + __nvvm_fmax_nan_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16 + __nvvm_fmax_ftz_nan_xorsign_abs_f16(0.1f16, 0.1f16); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.xorsign.abs.f16x2 + __nvvm_fmax_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2 + __nvvm_fmax_ftz_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2 + __nvvm_fmax_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2 + __nvvm_fmax_ftz_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); +#endif + // CHECK: ret void +} diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -16,6 +16,12 @@ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 \ // RUN: -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ 
+// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ +// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -791,3 +797,112 @@ #endif // CHECK: ret void } + +// CHECK-LABEL: nvvm_abs_neg_bf16_bf16x2_sm80 +__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() { +#if __CUDA_ARCH__ >= 800 + + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.abs.bf16(i16 -1) + __nvvm_abs_bf16(0xFFFF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.abs.bf16x2(i32 -1) + __nvvm_abs_bf16x2(0xFFFFFFFF); + + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.neg.bf16(i16 -1) + __nvvm_neg_bf16(0xFFFF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.neg.bf16x2(i32 -1) + __nvvm_neg_bf16x2(0xFFFFFFFF); +#endif + // CHECK: ret void +} + +#define NAN32 0x7FBFFFFF +#define NAN16 0x7FBF +#define BF16 0x1234 +#define BF16_2 0x4321 +#define NANBF16 0xFFC1 +#define BF16X2 0x12341234 +#define BF16X2_2 0x32343234 +#define NANBF16X2 0xFFC1FFC1 + +// CHECK-LABEL: nvvm_min_max_sm80 +__device__ void nvvm_min_max_sm80() { +#if __CUDA_ARCH__ >= 800 + + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.nan.f + __nvvm_fmin_nan_f(0.1f, (float)NAN32); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.ftz.nan.f + __nvvm_fmin_ftz_nan_f(0.1f, (float)NAN32); + + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.bf16 + __nvvm_fmin_bf16(BF16, BF16_2); + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.nan.bf16 + __nvvm_fmin_nan_bf16(BF16, NANBF16); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmin.bf16x2 + __nvvm_fmin_bf16x2(BF16X2, BF16X2_2); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmin.nan.bf16x2 + __nvvm_fmin_nan_bf16x2(BF16X2, NANBF16X2); + // CHECK_PTX70_SM80: 
call float @llvm.nvvm.fmax.nan.f + __nvvm_fmax_nan_f(0.1f, 0.11f); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f + __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32); + + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f + __nvvm_fmax_nan_f(0.1f, (float)NAN32); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f + __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32); + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmax.bf16 + __nvvm_fmax_bf16(BF16, BF16_2); + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmax.nan.bf16 + __nvvm_fmax_nan_bf16(BF16, NANBF16); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmax.bf16x2 + __nvvm_fmax_bf16x2(BF16X2, BF16X2_2); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmax.nan.bf16x2 + __nvvm_fmax_nan_bf16x2(NANBF16X2, BF16X2); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f + __nvvm_fmax_nan_f(0.1f, (float)NAN32); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f + __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32); + +#endif + // CHECK: ret void +} +// CHECK-LABEL: nvvm_min_max_sm86 +__device__ void nvvm_min_max_sm86() { +#if __CUDA_ARCH__ >= 860 + + // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmin.xorsign.abs.bf16 + __nvvm_fmin_xorsign_abs_bf16(BF16, BF16_2); + // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmin.nan.xorsign.abs.bf16 + __nvvm_fmin_nan_xorsign_abs_bf16(BF16, NANBF16); + // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmin.xorsign.abs.bf16x2 + __nvvm_fmin_xorsign_abs_bf16x2(BF16X2, BF16X2_2); + // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2 + __nvvm_fmin_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.xorsign.abs.f + __nvvm_fmin_xorsign_abs_f(-0.1f, 0.1f); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.xorsign.abs.f + __nvvm_fmin_ftz_xorsign_abs_f(-0.1f, 0.1f); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.nan.xorsign.abs.f + __nvvm_fmin_nan_xorsign_abs_f(-0.1f, (float)NAN32); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f + 
__nvvm_fmin_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32); + + // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmax.xorsign.abs.bf16 + __nvvm_fmax_xorsign_abs_bf16(BF16, BF16_2); + // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmax.nan.xorsign.abs.bf16 + __nvvm_fmax_nan_xorsign_abs_bf16(BF16, NANBF16); + // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmax.xorsign.abs.bf16x2 + __nvvm_fmax_xorsign_abs_bf16x2(BF16X2, BF16X2_2); + // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2 + __nvvm_fmax_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.xorsign.abs.f + __nvvm_fmax_xorsign_abs_f(-0.1f, 0.1f); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.xorsign.abs.f + __nvvm_fmax_ftz_xorsign_abs_f(-0.1f, 0.1f); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.nan.xorsign.abs.f + __nvvm_fmax_nan_xorsign_abs_f(-0.1f, (float)NAN32); + // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f + __nvvm_fmax_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32); +#endif + // CHECK: ret void +} diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -564,26 +564,55 @@ // Min Max // - def int_nvvm_fmin_f : GCCBuiltin<"__nvvm_fmin_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmin_ftz_f : GCCBuiltin<"__nvvm_fmin_ftz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], + foreach operation = ["min", "max"] in { + def int_nvvm_f # operation # _d : + GCCBuiltin<!strconcat("__nvvm_f", operation, "_d")>, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmax_f : GCCBuiltin<"__nvvm_fmax_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty] - , [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmax_ftz_f : 
GCCBuiltin<"__nvvm_fmax_ftz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + foreach variant = ["_f", "_ftz_f", "_nan_f", "_ftz_nan_f", + "_xorsign_abs_f", "_ftz_xorsign_abs_f", "_nan_xorsign_abs_f", + "_ftz_nan_xorsign_abs_f"] in { + def int_nvvm_f # operation # variant : + GCCBuiltin<!strconcat("__nvvm_f", operation, variant)>, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } - def int_nvvm_fmin_d : GCCBuiltin<"__nvvm_fmin_d">, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmax_d : GCCBuiltin<"__nvvm_fmax_d">, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + foreach variant = ["_f16", "_ftz_f16", "_nan_f16", "_ftz_nan_f16", + "_xorsign_abs_f16", "_ftz_xorsign_abs_f16", "_nan_xorsign_abs_f16", + "_ftz_nan_xorsign_abs_f16"] in { + def int_nvvm_f # operation # variant : + GCCBuiltin<!strconcat("__nvvm_f", operation, variant)>, + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + + foreach variant = ["_f16x2", "_ftz_f16x2", "_nan_f16x2", + "_ftz_nan_f16x2", "_xorsign_abs_f16x2", "_ftz_xorsign_abs_f16x2", + "_nan_xorsign_abs_f16x2", "_ftz_nan_xorsign_abs_f16x2"] in { + def int_nvvm_f # operation # variant : + GCCBuiltin<!strconcat("__nvvm_f", operation, variant)>, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + + foreach variant = ["_bf16", "_nan_bf16", "_xorsign_abs_bf16", + "_nan_xorsign_abs_bf16"] in { + def int_nvvm_f # operation # variant : + GCCBuiltin<!strconcat("__nvvm_f", operation, variant)>, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + + foreach variant = ["_bf16x2", "_nan_bf16x2", "_xorsign_abs_bf16x2", + "_nan_xorsign_abs_bf16x2"] in { + def int_nvvm_f # operation # variant : + GCCBuiltin<!strconcat("__nvvm_f", operation, variant)>, + 
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + } // // Multiplication @@ -740,6 +769,19 @@ def int_nvvm_fabs_d : GCCBuiltin<"__nvvm_fabs_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; +// +// Abs, Neg bf16, bf16x2 +// + + foreach unary = ["abs", "neg"] in { + def int_nvvm_ # unary # _bf16 : + GCCBuiltin<!strconcat("__nvvm_", unary, "_bf16")>, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem]>; + def int_nvvm_ # unary # _bf16x2 : + GCCBuiltin<!strconcat("__nvvm_", unary, "_bf16x2")>, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + } + // // Round // diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -152,12 +152,14 @@ def hasPTX65 : Predicate<"Subtarget->getPTXVersion() >= 65">; def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">; def hasPTX71 : Predicate<"Subtarget->getPTXVersion() >= 71">; +def hasPTX72 : Predicate<"Subtarget->getPTXVersion() >= 72">; def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">; def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">; def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">; def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">; def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">; +def hasSM86 : Predicate<"Subtarget->getSmVersion() >= 86">; // non-sync shfl instructions are not available on sm_70+ in PTX6.4+ def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -549,19 +549,22 @@ // We need a full string for OpcStr here because we need to deal with case like // INT_PTX_RECIP.
class F_MATH_1<string OpcStr, NVPTXRegClass target_regclass, -               NVPTXRegClass src_regclass, Intrinsic IntOP> +               NVPTXRegClass src_regclass, Intrinsic IntOP, +               list<Predicate> Preds = []> : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0), OpcStr, - [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>; + [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>, + Requires<Preds>; // We need a full string for OpcStr here because we need to deal with the case // like INT_PTX_NATIVE_POWR_F. class F_MATH_2<string OpcStr, NVPTXRegClass t_regclass, -               NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP> +               NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP, + list<Predicate> Preds = []> : NVPTXInst<(outs t_regclass:$dst), (ins s0_regclass:$src0, s1_regclass:$src1), OpcStr, - [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>; + [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>, + Requires<Preds>; class F_MATH_3; def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_f>; def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;", Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>; +def INT_NVVM_FMIN_NAN_F : F_MATH_2<"min.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_nan_f, + [hasPTX70, hasSM80]>; +def INT_NVVM_FMIN_FTZ_NAN_F : F_MATH_2<"min.ftz.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_nan_f, + [hasPTX70, hasSM80]>; +def INT_NVVM_FMIN_XORSIGN_ABS_F : + F_MATH_2<"min.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMIN_FTZ_XORSIGN_ABS_F : + F_MATH_2<"min.ftz.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMIN_NAN_XORSIGN_ABS_F : + F_MATH_2<"min.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_nan_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMIN_FTZ_NAN_XORSIGN_ABS_F : + F_MATH_2<"min.ftz.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_nan_xorsign_abs_f, + 
[hasPTX72, hasSM86]>; def INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_f>; def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;", Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>; +def INT_NVVM_FMAX_NAN_F : F_MATH_2<"max.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_nan_f, + [hasPTX70, hasSM80]>; +def INT_NVVM_FMAX_FTZ_NAN_F : F_MATH_2<"max.ftz.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_nan_f, + [hasPTX70, hasSM80]>; +def INT_NVVM_FMAX_XORSIGN_ABS_F : + F_MATH_2<"max.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMAX_FTZ_XORSIGN_ABS_F : + F_MATH_2<"max.ftz.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMAX_NAN_XORSIGN_ABS_F : + F_MATH_2<"max.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_nan_xorsign_abs_f, + [hasPTX72, hasSM86]>; +def INT_NVVM_FMAX_FTZ_NAN_XORSIGN_ABS_F : + F_MATH_2<"max.ftz.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_nan_xorsign_abs_f, + [hasPTX72, hasSM86]>; def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs, Float64Regs, Float64Regs, int_nvvm_fmin_d>; def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs, Float64Regs, Float64Regs, int_nvvm_fmax_d>; +// +// Min Max f16, f16x2, bf16, bf16x2 +// +class MIN_MAX_TUPLE<string V, Intrinsic I, NVPTXRegClass RC, + list<Predicate> Preds = [hasPTX70, hasSM80]> { + string Variant = V; + Intrinsic Intr = I; + NVPTXRegClass RegClass = RC; + list<Predicate> Predicates = Preds; +} + +multiclass MIN_MAX<string IntName> { + foreach P = [ + MIN_MAX_TUPLE<"_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_f16, + int_nvvm_fmax_f16), Float16Regs>, + 
MIN_MAX_TUPLE<"_ftz_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_ftz_f16, + int_nvvm_fmax_ftz_f16), Float16Regs>, + MIN_MAX_TUPLE<"_NaN_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_nan_f16, + int_nvvm_fmax_nan_f16), Float16Regs>, + MIN_MAX_TUPLE<"_ftz_NaN_f16", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_nan_f16, int_nvvm_fmax_ftz_nan_f16), Float16Regs>, + MIN_MAX_TUPLE<"_xorsign_abs_f16", !if(!eq(IntName, "min"), + int_nvvm_fmin_xorsign_abs_f16, int_nvvm_fmax_xorsign_abs_f16), + Float16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_ftz_xorsign_abs_f16", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_xorsign_abs_f16, int_nvvm_fmax_ftz_xorsign_abs_f16), + Float16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_NaN_xorsign_abs_f16", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_xorsign_abs_f16, int_nvvm_fmax_nan_xorsign_abs_f16), + Float16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_ftz_NaN_xorsign_abs_f16", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_nan_xorsign_abs_f16, + int_nvvm_fmax_ftz_nan_xorsign_abs_f16), Float16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_f16x2", !if(!eq(IntName, "min"), int_nvvm_fmin_f16x2, + int_nvvm_fmax_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_ftz_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_f16x2, int_nvvm_fmax_ftz_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_NaN_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_f16x2, int_nvvm_fmax_nan_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_ftz_NaN_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_nan_f16x2, int_nvvm_fmax_ftz_nan_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_xorsign_abs_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_xorsign_abs_f16x2, int_nvvm_fmax_xorsign_abs_f16x2), + Float16x2Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_ftz_xorsign_abs_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_xorsign_abs_f16x2, int_nvvm_fmax_ftz_xorsign_abs_f16x2), + Float16x2Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_NaN_xorsign_abs_f16x2", !if(!eq(IntName, "min"), + 
int_nvvm_fmin_nan_xorsign_abs_f16x2, int_nvvm_fmax_nan_xorsign_abs_f16x2), + Float16x2Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_ftz_NaN_xorsign_abs_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_nan_xorsign_abs_f16x2, + int_nvvm_fmax_ftz_nan_xorsign_abs_f16x2), + Float16x2Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_bf16", !if(!eq(IntName, "min"), + int_nvvm_fmin_bf16, int_nvvm_fmax_bf16), Int16Regs>, + MIN_MAX_TUPLE<"_NaN_bf16", !if(!eq(IntName, "min"), int_nvvm_fmin_nan_bf16, + int_nvvm_fmax_nan_bf16), Int16Regs>, + MIN_MAX_TUPLE<"_xorsign_abs_bf16", !if(!eq(IntName, "min"), + int_nvvm_fmin_xorsign_abs_bf16, int_nvvm_fmax_xorsign_abs_bf16), + Int16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_NaN_xorsign_abs_bf16", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_xorsign_abs_bf16, int_nvvm_fmax_nan_xorsign_abs_bf16), + Int16Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_bf16x2", !if(!eq(IntName, "min"), int_nvvm_fmin_bf16x2, + int_nvvm_fmax_bf16x2), Int32Regs>, + MIN_MAX_TUPLE<"_NaN_bf16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_bf16x2, int_nvvm_fmax_nan_bf16x2), Int32Regs>, + MIN_MAX_TUPLE<"_xorsign_abs_bf16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_xorsign_abs_bf16x2, int_nvvm_fmax_xorsign_abs_bf16x2), + Int32Regs, [hasPTX72, hasSM86]>, + MIN_MAX_TUPLE<"_NaN_xorsign_abs_bf16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_xorsign_abs_bf16x2, + int_nvvm_fmax_nan_xorsign_abs_bf16x2), + Int32Regs, [hasPTX72, hasSM86]>] in { + def P.Variant : F_MATH_2<!strconcat( + IntName, !subst("_", ".", P.Variant), " \t$dst, $src0, $src1;"), + P.RegClass, P.RegClass, P.RegClass, P.Intr, P.Predicates>; + } +} + +defm INT_NVVM_FMIN : MIN_MAX<"min">; +defm INT_NVVM_FMAX : MIN_MAX<"max">; // // Multiplication @@ -719,6 +849,19 @@ def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_fabs_d>; +// +// Abs, Neg bf16, bf16x2 +// + +def INT_NVVM_ABS_BF16 : F_MATH_1<"abs.bf16 \t$dst, $src0;", Int16Regs, + Int16Regs, int_nvvm_abs_bf16, [hasPTX70, hasSM80]>; +def INT_NVVM_ABS_BF16X2 : F_MATH_1<"abs.bf16x2 \t$dst, $src0;", Int32Regs, + Int32Regs, int_nvvm_abs_bf16x2, 
[hasPTX70, hasSM80]>; +def INT_NVVM_NEG_BF16 : F_MATH_1<"neg.bf16 \t$dst, $src0;", Int16Regs, + Int16Regs, int_nvvm_neg_bf16, [hasPTX70, hasSM80]>; +def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2 \t$dst, $src0;", Int32Regs, + Int32Regs, int_nvvm_neg_bf16x2, [hasPTX70, hasSM80]>; + // // Round // diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -145,11 +145,15 @@ Optional<SpecialCase> Special; FtzRequirementTy FtzRequirement = FTZ_Any; + // Denormal handling is guarded by different attributes depending on the + // type (denormal-fp-math vs denormal-fp-math-f32), take note of halfs. + bool IsHalfTy = false; SimplifyAction() = default; - SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) - : IID(IID), FtzRequirement(FtzReq) {} + SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq, + bool IsHalfTy = false) + : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {} // Cast operations don't have anything to do with FTZ, so we skip that // argument.
@@ -197,12 +201,52 @@ return {Intrinsic::maxnum, FTZ_MustBeOff}; case Intrinsic::nvvm_fmax_ftz_f: return {Intrinsic::maxnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmax_nan_f: + return {Intrinsic::maximum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmax_ftz_nan_f: + return {Intrinsic::maximum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmax_f16: + return {Intrinsic::maxnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_f16: + return {Intrinsic::maxnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmax_f16x2: + return {Intrinsic::maxnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_f16x2: + return {Intrinsic::maxnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmax_nan_f16: + return {Intrinsic::maximum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_nan_f16: + return {Intrinsic::maximum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmax_nan_f16x2: + return {Intrinsic::maximum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_nan_f16x2: + return {Intrinsic::maximum, FTZ_MustBeOn, true}; case Intrinsic::nvvm_fmin_d: return {Intrinsic::minnum, FTZ_Any}; case Intrinsic::nvvm_fmin_f: return {Intrinsic::minnum, FTZ_MustBeOff}; case Intrinsic::nvvm_fmin_ftz_f: return {Intrinsic::minnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmin_nan_f: + return {Intrinsic::minimum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmin_ftz_nan_f: + return {Intrinsic::minimum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmin_f16: + return {Intrinsic::minnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_f16: + return {Intrinsic::minnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmin_f16x2: + return {Intrinsic::minnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_f16x2: + return {Intrinsic::minnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmin_nan_f16: + return {Intrinsic::minimum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_nan_f16: + return {Intrinsic::minimum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmin_nan_f16x2: + return {Intrinsic::minimum, 
FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_nan_f16x2: + return {Intrinsic::minimum, FTZ_MustBeOn, true}; case Intrinsic::nvvm_round_d: return {Intrinsic::round, FTZ_Any}; case Intrinsic::nvvm_round_f: @@ -316,9 +360,10 @@ // intrinsic, we don't have to look up any module metadata, as // FtzRequirementTy will be FTZ_Any.) if (Action.FtzRequirement != FTZ_Any) { - StringRef Attr = II->getFunction() - ->getFnAttribute("denormal-fp-math-f32") - .getValueAsString(); + const char *AttrName = + Action.IsHalfTy ? "denormal-fp-math" : "denormal-fp-math-f32"; + StringRef Attr = + II->getFunction()->getFnAttribute(AttrName).getValueAsString(); DenormalMode Mode = parseDenormalFPAttribute(Attr); bool FtzEnabled = Mode.Output != DenormalMode::IEEE; diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll @@ -0,0 +1,268 @@ +; RUN: opt < %s -instcombine -S -mtriple=nvptx-nvidia-cuda -march=nvptx64 \ +; RUN: -mcpu=sm_80 -mattr=+ptx70 | \ +; RUN: FileCheck %s + +declare half @llvm.nvvm.fmin.f16(half, half) +declare half @llvm.nvvm.fmin.ftz.f16(half, half) +declare <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half>, <2 x half>) +declare float @llvm.nvvm.fmin.nan.f(float, float) +declare float @llvm.nvvm.fmin.ftz.nan.f(float, float) +declare half @llvm.nvvm.fmin.nan.f16(half, half) +declare half @llvm.nvvm.fmin.ftz.nan.f16(half, half) +declare <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half>, <2 x half>) + +declare half @llvm.nvvm.fmax.f16(half, half) +declare half @llvm.nvvm.fmax.ftz.f16(half, half) +declare <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half>, <2 x half>) +declare float 
@llvm.nvvm.fmax.nan.f(float, float) +declare float @llvm.nvvm.fmax.ftz.nan.f(float, float) +declare half @llvm.nvvm.fmax.nan.f16(half, half) +declare half @llvm.nvvm.fmax.ftz.nan.f16(half, half) +declare <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half>, <2 x half>) + +; CHECK-LABEL: fmin_f16 +define half @fmin_f16(half %0, half %1) { + ; CHECK-NOT: @llvm.nvvm.fmin.f16 + ; CHECK: @llvm.minnum.f16 + %res = call half @llvm.nvvm.fmin.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_f16 +define half @fmin_ftz_f16(half %0, half %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmin.ftz.f16 + ; CHECK: @llvm.minnum.f16 + %res = call half @llvm.nvvm.fmin.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_f16_no_attr +define half @fmin_ftz_f16_no_attr(half %0, half %1) { + ; CHECK-NOT: @llvm.minnum.f16 + ; CHECK: @llvm.nvvm.fmin.ftz.f16 + %res = call half @llvm.nvvm.fmin.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_f16x2 +define <2 x half> @fmin_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.nvvm.fmin.f16x2 + ; CHECK: @llvm.minnum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_f16x2 +define <2 x half> @fmin_ftz_f16x2(<2 x half> %0, <2 x half> %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmin.ftz.f16x2 + ; CHECK: @llvm.minnum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_f16x2_no_attr +define <2 x half> @fmin_ftz_f16x2_no_attr(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.minnum.v2f16 + ; CHECK: @llvm.nvvm.fmin.ftz.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_nan_f +define float @fmin_nan_f(float %0, float %1) { + ; CHECK-NOT: @llvm.nvvm.fmin.nan.f + ; CHECK: @llvm.minimum.f32 + 
%res = call float @llvm.nvvm.fmin.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmin_ftz_nan_f +define float @fmin_ftz_nan_f(float %0, float %1) #1 { + ; CHECK-NOT: @llvm.nvvm.fmin.ftz.nan.f + ; CHECK: @llvm.minimum.f32 + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmin_ftz_nan_f_no_attr +define float @fmin_ftz_nan_f_no_attr(float %0, float %1) { + ; CHECK: @llvm.nvvm.fmin.ftz.nan.f + ; CHECK-NOT: @llvm.minimum.f32 + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmin_nan_f16 +define half @fmin_nan_f16(half %0, half %1) { + ; CHECK-NOT: @llvm.nvvm.fmin.nan.f16 + ; CHECK: @llvm.minimum.f16 + %res = call half @llvm.nvvm.fmin.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16 +define half @fmin_ftz_nan_f16(half %0, half %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmin.ftz.nan.f16 + ; CHECK: @llvm.minimum.f16 + %res = call half @llvm.nvvm.fmin.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16_no_attr +define half @fmin_ftz_nan_f16_no_attr(half %0, half %1) { + ; CHECK: @llvm.nvvm.fmin.ftz.nan.f16 + ; CHECK-NOT: @llvm.minimum.f16 + %res = call half @llvm.nvvm.fmin.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_nan_f16x2 +define <2 x half> @fmin_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.nvvm.fmin.nan.f16x2 + ; CHECK: @llvm.minimum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16x2 +define <2 x half> @fmin_ftz_nan_f16x2(<2 x half> %0, <2 x half> %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmin.ftz.nan.f16x2 + ; CHECK: @llvm.minimum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16x2_no_attr +define <2 x half> @fmin_ftz_nan_f16x2_no_attr(<2 x half> %0, <2 x 
half> %1) { + ; CHECK-NOT: @llvm.minimum.v2f16 + ; CHECK: @llvm.nvvm.fmin.ftz.nan.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_f16 +define half @fmax_f16(half %0, half %1) { + ; CHECK-NOT: @llvm.nvvm.fmax.f16 + ; CHECK: @llvm.maxnum.f16 + %res = call half @llvm.nvvm.fmax.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_f16 +define half @fmax_ftz_f16(half %0, half %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmax.ftz.f16 + ; CHECK: @llvm.maxnum.f16 + %res = call half @llvm.nvvm.fmax.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_f16_no_attr +define half @fmax_ftz_f16_no_attr(half %0, half %1) { + ; CHECK-NOT: @llvm.maxnum.f16 + ; CHECK: @llvm.nvvm.fmax.ftz.f16 + %res = call half @llvm.nvvm.fmax.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_f16x2 +define <2 x half> @fmax_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.nvvm.fmax.f16x2 + ; CHECK: @llvm.maxnum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_f16x2 +define <2 x half> @fmax_ftz_f16x2(<2 x half> %0, <2 x half> %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmax.ftz.f16x2 + ; CHECK: @llvm.maxnum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_f16x2_no_attr +define <2 x half> @fmax_ftz_f16x2_no_attr(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.maxnum.v2f16 + ; CHECK: @llvm.nvvm.fmax.ftz.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_nan_f +define float @fmax_nan_f(float %0, float %1) { + ; CHECK-NOT: @llvm.nvvm.fmax.nan.f + ; CHECK: @llvm.maximum.f32 + %res = call float @llvm.nvvm.fmax.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmax_ftz_nan_f +define float 
@fmax_ftz_nan_f(float %0, float %1) #1 { + ; CHECK-NOT: @llvm.nvvm.fmax.ftz.nan.f + ; CHECK: @llvm.maximum.f32 + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmax_ftz_nan_f_no_attr +define float @fmax_ftz_nan_f_no_attr(float %0, float %1) { + ; CHECK: @llvm.nvvm.fmax.ftz.nan.f + ; CHECK-NOT: @llvm.maximum.f32 + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmax_nan_f16 +define half @fmax_nan_f16(half %0, half %1) { + ; CHECK-NOT: @llvm.nvvm.fmax.nan.f16 + ; CHECK: @llvm.maximum.f16 + %res = call half @llvm.nvvm.fmax.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16 +define half @fmax_ftz_nan_f16(half %0, half %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmax.ftz.nan.f16 + ; CHECK: @llvm.maximum.f16 + %res = call half @llvm.nvvm.fmax.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16_no_attr +define half @fmax_ftz_nan_f16_no_attr(half %0, half %1) { + ; CHECK: @llvm.nvvm.fmax.ftz.nan.f16 + ; CHECK-NOT: @llvm.maximum.f16 + %res = call half @llvm.nvvm.fmax.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_nan_f16x2 +define <2 x half> @fmax_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.nvvm.fmax.nan.f16x2 + ; CHECK: @llvm.maximum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16x2 +define <2 x half> @fmax_ftz_nan_f16x2(<2 x half> %0, <2 x half> %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmax.ftz.nan.f16x2 + ; CHECK: @llvm.maximum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16x2_no_attr +define <2 x half> @fmax_ftz_nan_f16x2_no_attr(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.maximum.v2f16 + ; CHECK: @llvm.nvvm.fmax.ftz.nan.f16x2 + %res = call <2 x half> 
@llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +attributes #0 = { "denormal-fp-math"="preserve-sign" } +attributes #1 = { "denormal-fp-math-f32"="preserve-sign" } diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll @@ -0,0 +1,260 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s + +declare i16 @llvm.nvvm.abs.bf16(i16) +declare i32 @llvm.nvvm.abs.bf16x2(i32) +declare i16 @llvm.nvvm.neg.bf16(i16) +declare i32 @llvm.nvvm.neg.bf16x2(i32) + +declare float @llvm.nvvm.fmin.nan.f(float, float) +declare float @llvm.nvvm.fmin.ftz.nan.f(float, float) +declare half @llvm.nvvm.fmin.f16(half, half) +declare half @llvm.nvvm.fmin.ftz.f16(half, half) +declare half @llvm.nvvm.fmin.nan.f16(half, half) +declare half @llvm.nvvm.fmin.ftz.nan.f16(half, half) +declare <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half>, <2 x half>) +declare i16 @llvm.nvvm.fmin.bf16(i16, i16) +declare i16 @llvm.nvvm.fmin.nan.bf16(i16, i16) +declare i32 @llvm.nvvm.fmin.bf16x2(i32, i32) +declare i32 @llvm.nvvm.fmin.nan.bf16x2(i32, i32) + +declare float @llvm.nvvm.fmax.nan.f(float, float) +declare float @llvm.nvvm.fmax.ftz.nan.f(float, float) +declare half @llvm.nvvm.fmax.f16(half, half) +declare half @llvm.nvvm.fmax.ftz.f16(half, half) +declare half @llvm.nvvm.fmax.nan.f16(half, half) +declare half @llvm.nvvm.fmax.ftz.nan.f16(half, half) +declare <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half>, <2 x half>) +declare <2 x half> 
@llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half>, <2 x half>) +declare i16 @llvm.nvvm.fmax.bf16(i16, i16) +declare i16 @llvm.nvvm.fmax.nan.bf16(i16, i16) +declare i32 @llvm.nvvm.fmax.bf16x2(i32, i32) +declare i32 @llvm.nvvm.fmax.nan.bf16x2(i32, i32) + +; CHECK-LABEL: abs_bf16 +define i16 @abs_bf16(i16 %0) { + ; CHECK: abs.bf16 + %res = call i16 @llvm.nvvm.abs.bf16(i16 %0); + ret i16 %res +} + +; CHECK-LABEL: abs_bf16x2 +define i32 @abs_bf16x2(i32 %0) { + ; CHECK: abs.bf16x2 + %res = call i32 @llvm.nvvm.abs.bf16x2(i32 %0); + ret i32 %res +} + +; CHECK-LABEL: neg_bf16 +define i16 @neg_bf16(i16 %0) { + ; CHECK: neg.bf16 + %res = call i16 @llvm.nvvm.neg.bf16(i16 %0); + ret i16 %res +} + +; CHECK-LABEL: neg_bf16x2 +define i32 @neg_bf16x2(i32 %0) { + ; CHECK: neg.bf16x2 + %res = call i32 @llvm.nvvm.neg.bf16x2(i32 %0); + ret i32 %res +} + +; CHECK-LABEL: fmin_nan_f +define float @fmin_nan_f(float %0, float %1) { + ; CHECK: min.NaN.f32 + %res = call float @llvm.nvvm.fmin.nan.f(float %0, float %1); + ret float %res +} + +; CHECK-LABEL: fmin_ftz_nan_f +define float @fmin_ftz_nan_f(float %0, float %1) { + ; CHECK: min.ftz.NaN.f32 + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float %0, float %1); + ret float %res +} + +; CHECK-LABEL: fmin_f16 +define half @fmin_f16(half %0, half %1) { + ; CHECK: min.f16 + %res = call half @llvm.nvvm.fmin.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_f16 +define half @fmin_ftz_f16(half %0, half %1) { + ; CHECK: min.ftz.f16 + %res = call half @llvm.nvvm.fmin.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_nan_f16 +define half @fmin_nan_f16(half %0, half %1) { + ; CHECK: min.NaN.f16 + %res = call half @llvm.nvvm.fmin.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16 +define half @fmin_ftz_nan_f16(half %0, half %1) { + ; CHECK: min.ftz.NaN.f16 + %res = call half @llvm.nvvm.fmin.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_f16x2 +define <2 x half> 
@fmin_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_f16x2 +define <2 x half> @fmin_ftz_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.ftz.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_nan_f16x2 +define <2 x half> @fmin_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.NaN.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16x2 +define <2 x half> @fmin_ftz_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.ftz.NaN.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_bf16 +define i16 @fmin_bf16(i16 %0, i16 %1) { + ; CHECK: min.bf16 + %res = call i16 @llvm.nvvm.fmin.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmin_nan_bf16 +define i16 @fmin_nan_bf16(i16 %0, i16 %1) { + ; CHECK: min.NaN.bf16 + %res = call i16 @llvm.nvvm.fmin.nan.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmin_bf16x2 +define i32 @fmin_bf16x2(i32 %0, i32 %1) { + ; CHECK: min.bf16x2 + %res = call i32 @llvm.nvvm.fmin.bf16x2(i32 %0, i32 %1) + ret i32 %res +} + +; CHECK-LABEL: fmin_nan_bf16x2 +define i32 @fmin_nan_bf16x2(i32 %0, i32 %1) { + ; CHECK: min.NaN.bf16x2 + %res = call i32 @llvm.nvvm.fmin.nan.bf16x2(i32 %0, i32 %1) + ret i32 %res +} + +; CHECK-LABEL: fmax_nan_f +define float @fmax_nan_f(float %0, float %1) { + ; CHECK: max.NaN.f32 + %res = call float @llvm.nvvm.fmax.nan.f(float %0, float %1); + ret float %res +} + +; CHECK-LABEL: fmax_ftz_nan_f +define float @fmax_ftz_nan_f(float %0, float %1) { + ; CHECK: max.ftz.NaN.f32 + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float %0, float %1); + ret float %res +} + +; CHECK-LABEL: fmax_f16 +define half 
@fmax_f16(half %0, half %1) { + ; CHECK: max.f16 + %res = call half @llvm.nvvm.fmax.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_f16 +define half @fmax_ftz_f16(half %0, half %1) { + ; CHECK: max.ftz.f16 + %res = call half @llvm.nvvm.fmax.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_nan_f16 +define half @fmax_nan_f16(half %0, half %1) { + ; CHECK: max.NaN.f16 + %res = call half @llvm.nvvm.fmax.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16 +define half @fmax_ftz_nan_f16(half %0, half %1) { + ; CHECK: max.ftz.NaN.f16 + %res = call half @llvm.nvvm.fmax.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_f16x2 +define <2 x half> @fmax_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_f16x2 +define <2 x half> @fmax_ftz_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.ftz.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_nan_f16x2 +define <2 x half> @fmax_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.NaN.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16x2 +define <2 x half> @fmax_ftz_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.ftz.NaN.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_bf16 +define i16 @fmax_bf16(i16 %0, i16 %1) { + ; CHECK: max.bf16 + %res = call i16 @llvm.nvvm.fmax.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmax_nan_bf16 +define i16 @fmax_nan_bf16(i16 %0, i16 %1) { + ; CHECK: max.NaN.bf16 + %res = call i16 @llvm.nvvm.fmax.nan.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmax_bf16x2 +define i32 
@fmax_bf16x2(i32 %0, i32 %1) { + ; CHECK: max.bf16x2 + %res = call i32 @llvm.nvvm.fmax.bf16x2(i32 %0, i32 %1) + ret i32 %res +} + +; CHECK-LABEL: fmax_nan_bf16x2 +define i32 @fmax_nan_bf16x2(i32 %0, i32 %1) { + ; CHECK: max.NaN.bf16x2 + %res = call i32 @llvm.nvvm.fmax.nan.bf16x2(i32 %0, i32 %1) + ret i32 %res +} diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm86-ptx72.ll @@ -0,0 +1,259 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_86 -mattr=+ptx72 | FileCheck %s + +declare half @llvm.nvvm.fmin.xorsign.abs.f16(half, half) +declare half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half, half) +declare half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half, half) +declare half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16(half, half) +declare <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2(<2 x half> , <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2(<2 x half> , <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>) +declare i16 @llvm.nvvm.fmin.xorsign.abs.bf16(i16, i16) +declare i16 @llvm.nvvm.fmin.nan.xorsign.abs.bf16(i16, i16) +declare i32 @llvm.nvvm.fmin.xorsign.abs.bf16x2(i32, i32) +declare i32 @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2(i32, i32) +declare float @llvm.nvvm.fmin.xorsign.abs.f(float, float) +declare float @llvm.nvvm.fmin.ftz.xorsign.abs.f(float, float) +declare float @llvm.nvvm.fmin.nan.xorsign.abs.f(float, float) +declare float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f(float, float) + +declare half @llvm.nvvm.fmax.xorsign.abs.f16(half, half) +declare half @llvm.nvvm.fmax.ftz.xorsign.abs.f16(half, half) +declare half @llvm.nvvm.fmax.nan.xorsign.abs.f16(half, half) +declare half @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16(half, half) +declare <2 x half> @llvm.nvvm.fmax.xorsign.abs.f16x2(<2 x half> 
, <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2(<2 x half> , <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2(<2 x half> , <2 x half>) +declare i16 @llvm.nvvm.fmax.xorsign.abs.bf16(i16, i16) +declare i16 @llvm.nvvm.fmax.nan.xorsign.abs.bf16(i16, i16) +declare i32 @llvm.nvvm.fmax.xorsign.abs.bf16x2(i32, i32) +declare i32 @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2(i32, i32) +declare float @llvm.nvvm.fmax.xorsign.abs.f(float, float) +declare float @llvm.nvvm.fmax.ftz.xorsign.abs.f(float, float) +declare float @llvm.nvvm.fmax.nan.xorsign.abs.f(float, float) +declare float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f(float, float) + +; CHECK-LABEL: fmin_xorsign_abs_f16 +define half @fmin_xorsign_abs_f16(half %0, half %1) { + ; CHECK: min.xorsign.abs.f16 + %res = call half @llvm.nvvm.fmin.xorsign.abs.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_xorsign_abs_f16 +define half @fmin_ftz_xorsign_abs_f16(half %0, half %1) { + ; CHECK: min.ftz.xorsign.abs.f16 + %res = call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_nan_xorsign_abs_f16 +define half @fmin_nan_xorsign_abs_f16(half %0, half %1) { + ; CHECK: min.NaN.xorsign.abs.f16 + %res = call half @llvm.nvvm.fmin.nan.xorsign.abs.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_nan_xorsign_abs_f16 +define half @fmin_ftz_nan_xorsign_abs_f16(half %0, half %1) { + ; CHECK: min.ftz.NaN.xorsign.abs.f16 + %res = call half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_xorsign_abs_f16x2 +define <2 x half> @fmin_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.xorsign.abs.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_xorsign_abs_f16x2 +define <2 x half> 
@fmin_ftz_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.ftz.xorsign.abs.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_nan_xorsign_abs_f16x2 +define <2 x half> @fmin_nan_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.NaN.xorsign.abs.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_nan_xorsign_abs_f16x2 +define <2 x half> @fmin_ftz_nan_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.ftz.NaN.xorsign.abs.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_xorsign_abs_bf16 +define i16 @fmin_xorsign_abs_bf16(i16 %0, i16 %1) { + ; CHECK: min.xorsign.abs.bf16 + %res = call i16 @llvm.nvvm.fmin.xorsign.abs.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmin_nan_xorsign_abs_bf16 +define i16 @fmin_nan_xorsign_abs_bf16(i16 %0, i16 %1) { + ; CHECK: min.NaN.xorsign.abs.bf16 + %res = call i16 @llvm.nvvm.fmin.nan.xorsign.abs.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmin_xorsign_abs_bf16x2 +define i32 @fmin_xorsign_abs_bf16x2(i32 %0, i32 %1) { + ; CHECK: min.xorsign.abs.bf16x2 + %res = call i32 @llvm.nvvm.fmin.xorsign.abs.bf16x2(i32 %0, i32 %1) + ret i32 %res +} + +; CHECK-LABEL: fmin_nan_xorsign_abs_bf16x2 +define i32 @fmin_nan_xorsign_abs_bf16x2(i32 %0, i32 %1) { + ; CHECK: min.NaN.xorsign.abs.bf16x2 + %res = call i32 @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2(i32 %0, i32 %1) + ret i32 %res +} + +; CHECK-LABEL: fmin_xorsign_abs_f +define float @fmin_xorsign_abs_f(float %0, float %1) { + ; CHECK: min.xorsign.abs.f + %res = call float @llvm.nvvm.fmin.xorsign.abs.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmin_ftz_xorsign_abs_f +define float @fmin_ftz_xorsign_abs_f(float %0, float %1) { + ; 
CHECK: min.ftz.xorsign.abs.f + %res = call float @llvm.nvvm.fmin.ftz.xorsign.abs.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmin_nan_xorsign_abs_f +define float @fmin_nan_xorsign_abs_f(float %0, float %1) { + ; CHECK: min.NaN.xorsign.abs.f + %res = call float @llvm.nvvm.fmin.nan.xorsign.abs.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmin_ftz_nan_xorsign_abs_f +define float @fmin_ftz_nan_xorsign_abs_f(float %0, float %1) { + ; CHECK: min.ftz.NaN.xorsign.abs.f + %res = call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmax_xorsign_abs_f16 +define half @fmax_xorsign_abs_f16(half %0, half %1) { + ; CHECK: max.xorsign.abs.f16 + %res = call half @llvm.nvvm.fmax.xorsign.abs.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_xorsign_abs_f16 +define half @fmax_ftz_xorsign_abs_f16(half %0, half %1) { + ; CHECK: max.ftz.xorsign.abs.f16 + %res = call half @llvm.nvvm.fmax.ftz.xorsign.abs.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_nan_xorsign_abs_f16 +define half @fmax_nan_xorsign_abs_f16(half %0, half %1) { + ; CHECK: max.NaN.xorsign.abs.f16 + %res = call half @llvm.nvvm.fmax.nan.xorsign.abs.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_nan_xorsign_abs_f16 +define half @fmax_ftz_nan_xorsign_abs_f16(half %0, half %1) { + ; CHECK: max.ftz.NaN.xorsign.abs.f16 + %res = call half @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_xorsign_abs_f16x2 +define <2 x half> @fmax_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.xorsign.abs.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_xorsign_abs_f16x2 +define <2 x half> @fmax_ftz_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.ftz.xorsign.abs.f16x2 + %res = call <2 x half> 
@llvm.nvvm.fmax.ftz.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_nan_xorsign_abs_f16x2 +define <2 x half> @fmax_nan_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.NaN.xorsign.abs.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_nan_xorsign_abs_f16x2 +define <2 x half> @fmax_ftz_nan_xorsign_abs_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.ftz.NaN.xorsign.abs.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_xorsign_abs_bf16 +define i16 @fmax_xorsign_abs_bf16(i16 %0, i16 %1) { + ; CHECK: max.xorsign.abs.bf16 + %res = call i16 @llvm.nvvm.fmax.xorsign.abs.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmax_nan_xorsign_abs_bf16 +define i16 @fmax_nan_xorsign_abs_bf16(i16 %0, i16 %1) { + ; CHECK: max.NaN.xorsign.abs.bf16 + %res = call i16 @llvm.nvvm.fmax.nan.xorsign.abs.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmax_xorsign_abs_bf16x2 +define i32 @fmax_xorsign_abs_bf16x2(i32 %0, i32 %1) { + ; CHECK: max.xorsign.abs.bf16x2 + %res = call i32 @llvm.nvvm.fmax.xorsign.abs.bf16x2(i32 %0, i32 %1) + ret i32 %res +} + +; CHECK-LABEL: fmax_nan_xorsign_abs_bf16x2 +define i32 @fmax_nan_xorsign_abs_bf16x2(i32 %0, i32 %1) { + ; CHECK: max.NaN.xorsign.abs.bf16x2 + %res = call i32 @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2(i32 %0, i32 %1) + ret i32 %res +} + +; CHECK-LABEL: fmax_xorsign_abs_f +define float @fmax_xorsign_abs_f(float %0, float %1) { + ; CHECK: max.xorsign.abs.f + %res = call float @llvm.nvvm.fmax.xorsign.abs.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmax_ftz_xorsign_abs_f +define float @fmax_ftz_xorsign_abs_f(float %0, float %1) { + ; CHECK: max.ftz.xorsign.abs.f + %res = call float @llvm.nvvm.fmax.ftz.xorsign.abs.f(float %0, float %1) + ret float %res +} + +; 
CHECK-LABEL: fmax_nan_xorsign_abs_f +define float @fmax_nan_xorsign_abs_f(float %0, float %1) { + ; CHECK: max.NaN.xorsign.abs.f + %res = call float @llvm.nvvm.fmax.nan.xorsign.abs.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmax_ftz_nan_xorsign_abs_f +define float @fmax_ftz_nan_xorsign_abs_f(float %0, float %1) { + ; CHECK: max.ftz.NaN.xorsign.abs.f + %res = call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f(float %0, float %1) + ret float %res +}