Index: clang/include/clang/Basic/BuiltinsNVPTX.def =================================================================== --- clang/include/clang/Basic/BuiltinsNVPTX.def +++ clang/include/clang/Basic/BuiltinsNVPTX.def @@ -107,13 +107,41 @@ // Min Max -BUILTIN(__nvvm_fmax_ftz_f, "fff", "") -BUILTIN(__nvvm_fmax_f, "fff", "") -BUILTIN(__nvvm_fmin_ftz_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmin_f16, "hhh", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_f16, "hhh", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_f16, "hhh", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16, "hhh", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_f16x2, "V2hV2hV2h", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_f16x2, "V2hV2hV2h", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_bf16, "UsUsUs", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_bf16, "UsUsUs", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_bf16x2, "ZUiZUiZUi", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80,PTX70)) BUILTIN(__nvvm_fmin_f, "fff", "") +BUILTIN(__nvvm_fmin_ftz_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmin_nan_f, "fff", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f, "fff", "", AND(SM_80,PTX70)) +BUILTIN(__nvvm_fmin_d, "ddd", "") +TARGET_BUILTIN(__nvvm_fmax_f16, "hhh", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_f16, "hhh", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_f16, "hhh", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16, "hhh", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_f16x2, "V2hV2hV2h", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_f16x2, "V2hV2hV2h", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_bf16, "UsUsUs", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_bf16, "UsUsUs", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_bf16x2, "ZUiZUiZUi", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80,PTX70)) +BUILTIN(__nvvm_fmax_f, "fff", "") +BUILTIN(__nvvm_fmax_ftz_f, "fff", "") +TARGET_BUILTIN(__nvvm_fmax_nan_f, "fff", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f, "fff", "", AND(SM_80,PTX70)) BUILTIN(__nvvm_fmax_d, "ddd", "") -BUILTIN(__nvvm_fmin_d, "ddd", "") // Multiplication @@ -827,6 +855,13 @@ TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70)) + +// bf16, bf16x2 abs, neg +TARGET_BUILTIN(__nvvm_abs_bf16, "UsUs", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_abs_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_neg_bf16, "UsUs", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_neg_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70)) + #undef BUILTIN #undef TARGET_BUILTIN #pragma pop_macro("AND") Index: clang/test/CodeGen/builtins-nvptx-native-half-type.c =================================================================== --- /dev/null +++ clang/test/CodeGen/builtins-nvptx-native-half-type.c @@ -0,0 +1,53 @@ +// REQUIRES: nvptx-registered-target + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ +// RUN: sm_80 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \ +// RUN: -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \ +// RUN: -target-cpu sm_80 -target-feature +ptx70 -fcuda-is-device \ +// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s + +#define __device__ __attribute__((device)) + +// CHECK-LABEL: nvvm_min_max_sm80 +__device__ void nvvm_min_max_sm80() { +#if __CUDA_ARCH__ >= 800 + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.f16 + __nvvm_fmin_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.f16 + __nvvm_fmin_ftz_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.nan.f16 + __nvvm_fmin_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.nan.f16 + __nvvm_fmin_ftz_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.f16x2 + __nvvm_fmin_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.f16x2 + __nvvm_fmin_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.nan.f16x2 + __nvvm_fmin_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2 + __nvvm_fmin_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.f16 + __nvvm_fmax_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.f16 + __nvvm_fmax_ftz_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.nan.f16 + __nvvm_fmax_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.nan.f16 + __nvvm_fmax_ftz_nan_f16(0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.f16x2 + __nvvm_fmax_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.f16x2 + __nvvm_fmax_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.nan.f16x2 + __nvvm_fmax_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2 + __nvvm_fmax_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}); +#endif + // CHECK: ret void +} Index: clang/test/CodeGen/builtins-nvptx.c =================================================================== --- clang/test/CodeGen/builtins-nvptx.c +++ clang/test/CodeGen/builtins-nvptx.c @@ -791,3 +791,62 @@ #endif // CHECK: ret void } + +// CHECK-LABEL: nvvm_abs_neg_bf16_bf16x2_sm80 +__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() { +#if __CUDA_ARCH__ >= 800 + + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.abs.bf16(i16 -1) + __nvvm_abs_bf16(0xFFFF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.abs.bf16x2(i32 -1) + __nvvm_abs_bf16x2(0xFFFFFFFF); + + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.neg.bf16(i16 -1) + __nvvm_neg_bf16(0xFFFF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.neg.bf16x2(i32 -1) + __nvvm_neg_bf16x2(0xFFFFFFFF); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_min_max_sm80 +__device__ void nvvm_min_max_sm80() { +#if __CUDA_ARCH__ >= 800 + + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.nan.f + __nvvm_fmin_nan_f(0.1f, (float)0x7FBFFFFF); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.ftz.nan.f + __nvvm_fmin_ftz_nan_f(0.1f, (float)0x7FBFFFFF); + + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.bf16 + __nvvm_fmin_bf16(0x1234, 0x7FBF); + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.nan.bf16 + __nvvm_fmin_nan_bf16(0x1234, 0x7FBF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmin.bf16x2 + __nvvm_fmin_bf16x2(0x7FBFFFFF, 0xFFFFFFFF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmin.nan.bf16x2 + __nvvm_fmin_nan_bf16x2(0x7FBFFFFF, 0xFFFFFFFF); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f + __nvvm_fmax_nan_f(0.1f, (float)0x7FBFFFFF); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f + __nvvm_fmax_ftz_nan_f(0.1f, (float)0x7FBFFFFF); + + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f + __nvvm_fmax_nan_f(0.1f, (float)0x7FBFFFFF); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f + __nvvm_fmax_ftz_nan_f(0.1f, (float)0x7FBFFFFF); + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmax.bf16 + __nvvm_fmax_bf16(0x1234, 0x7FBF); + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmax.nan.bf16 + __nvvm_fmax_nan_bf16(0x1234, 0x7FBF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmax.bf16x2 + __nvvm_fmax_bf16x2(0x7FBFFFFF, 0xFFFFFFFF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmax.nan.bf16x2 + __nvvm_fmax_nan_bf16x2(0x7FBFFFFF, 0xFFFFFFFF); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f + __nvvm_fmax_nan_f(0.1f, (float)0x7FBFFFFF); + // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f + __nvvm_fmax_ftz_nan_f(0.1f, (float)0x7FBFFFFF); +#endif + // CHECK: ret void +} Index: llvm/include/llvm/IR/IntrinsicsNVVM.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsNVVM.td +++ llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -564,26 +564,48 @@ // Min Max // - def int_nvvm_fmin_f : GCCBuiltin<"__nvvm_fmin_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmin_ftz_f : GCCBuiltin<"__nvvm_fmin_ftz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], + foreach operation = ["min", "max"] in { + def int_nvvm_f # operation # _d : + GCCBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmax_f : GCCBuiltin<"__nvvm_fmax_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty] - , [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmax_ftz_f : GCCBuiltin<"__nvvm_fmax_ftz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + foreach capability = ["_f", "_ftz_f", "_nan_f", "_ftz_nan_f"] in { + def int_nvvm_f # operation # capability : + GCCBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } - def int_nvvm_fmin_d : GCCBuiltin<"__nvvm_fmin_d">, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; - def int_nvvm_fmax_d : GCCBuiltin<"__nvvm_fmax_d">, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + foreach capability = ["_f16", "_ftz_f16", "_nan_f16", "_ftz_nan_f16"] in { + def int_nvvm_f # operation # capability : + GCCBuiltin, + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + + foreach capability = ["_f16x2", "_ftz_f16x2", "_nan_f16x2", + "_ftz_nan_f16x2"] in { + def int_nvvm_f # operation # capability : + GCCBuiltin, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + + foreach capability = ["_bf16", "_nan_bf16"] in { + def int_nvvm_f # operation # capability : + GCCBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + + foreach capability = ["_bf16x2", "_nan_bf16x2"] in { + def int_nvvm_f # operation # capability : + GCCBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, Commutative]>; + } + } // // Multiplication @@ -740,6 +762,19 @@ def int_nvvm_fabs_d : GCCBuiltin<"__nvvm_fabs_d">, DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; +// +// Abs, Neg bf16, bf16x2 +// + + foreach unary = ["abs", "neg"] in { + def int_nvvm_ # unary # _bf16 : + GCCBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem]>; + def int_nvvm_ # unary # _bf16x2 : + GCCBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + } + // // Round // Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td =================================================================== --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -549,19 +549,22 @@ // We need a full string for OpcStr here because we need to deal with case like // INT_PTX_RECIP. class F_MATH_1 + NVPTXRegClass src_regclass, Intrinsic IntOP, list Preds = []> : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0), OpcStr, - [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>; + [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>, + Requires; // We need a full string for OpcStr here because we need to deal with the case // like INT_PTX_NATIVE_POWR_F. class F_MATH_2 + NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP, + list Preds = []> : NVPTXInst<(outs t_regclass:$dst), (ins s0_regclass:$src0, s1_regclass:$src1), OpcStr, - [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>; + [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>, + Requires; class F_MATH_3; def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;", Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>; +def INT_NVVM_FMIN_NAN_F : F_MATH_2<"min.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_nan_f, + [hasPTX70, hasSM80]>; +def INT_NVVM_FMIN_FTZ_NAN_F : F_MATH_2<"min.ftz.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_nan_f, + [hasPTX70, hasSM80]>; def INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_f>; def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;", Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>; +def INT_NVVM_FMAX_NAN_F : F_MATH_2<"max.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_nan_f, + [hasPTX70, hasSM80]>; +def INT_NVVM_FMAX_FTZ_NAN_F : F_MATH_2<"max.ftz.NaN.f32 \t$dst, $src0, $src1;", + Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_nan_f, + [hasPTX70, hasSM80]>; def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs, Float64Regs, Float64Regs, int_nvvm_fmin_d>; def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs, Float64Regs, Float64Regs, int_nvvm_fmax_d>; +// +// Min Max f16, f16x2, bf16, bf16x2 +// +class MIN_MAX_TUPLE { + string Capacity = C; + Intrinsic Intr = I; + NVPTXRegClass RegClass = RC; +} + +multiclass MIN_MAX { + foreach P = [ + MIN_MAX_TUPLE<"_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_f16, + int_nvvm_fmax_f16), Float16Regs>, + MIN_MAX_TUPLE<"_ftz_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_ftz_f16, + int_nvvm_fmax_ftz_f16), Float16Regs>, + MIN_MAX_TUPLE<"_NaN_f16", !if(!eq(IntName, "min"), int_nvvm_fmin_nan_f16, + int_nvvm_fmax_nan_f16), Float16Regs>, + MIN_MAX_TUPLE<"_ftz_NaN_f16", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_nan_f16, int_nvvm_fmax_ftz_nan_f16), Float16Regs>, + MIN_MAX_TUPLE<"_f16x2", !if(!eq(IntName, "min"), int_nvvm_fmin_f16x2, + int_nvvm_fmax_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_ftz_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_f16x2, int_nvvm_fmax_ftz_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_NaN_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_f16x2, int_nvvm_fmax_nan_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_ftz_NaN_f16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_ftz_nan_f16x2, int_nvvm_fmax_ftz_nan_f16x2), Float16x2Regs>, + MIN_MAX_TUPLE<"_bf16", !if(!eq(IntName, "min"), + int_nvvm_fmin_bf16, int_nvvm_fmax_bf16), Int16Regs>, + MIN_MAX_TUPLE<"_NaN_bf16", !if(!eq(IntName, "min"), int_nvvm_fmin_nan_bf16, + int_nvvm_fmax_nan_bf16), Int16Regs>, + MIN_MAX_TUPLE<"_bf16x2", !if(!eq(IntName, "min"), int_nvvm_fmin_bf16x2, + int_nvvm_fmax_bf16x2), Int32Regs>, + MIN_MAX_TUPLE<"_NaN_bf16x2", !if(!eq(IntName, "min"), + int_nvvm_fmin_nan_bf16x2, int_nvvm_fmax_nan_bf16x2), Int32Regs>] in { + + def P.Capacity : F_MATH_2; + } +} + +defm INT_NVVM_FMIN : MIN_MAX<"min">; +defm INT_NVVM_FMAN : MIN_MAX<"max">; // // Multiplication @@ -719,6 +778,19 @@ def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_fabs_d>; +// +// Abs, Neg bf16, bf16x2 +// + +def INT_NVVM_ABS_BF16 : F_MATH_1<"abs.bf16 \t$dst, $dst;", Int16Regs, + Int16Regs, int_nvvm_abs_bf16, [hasPTX70, hasSM80]>; +def INT_NVVM_ABS_BF16X2 : F_MATH_1<"abs.bf16x2 \t$dst, $dst;", Int32Regs, + Int32Regs, int_nvvm_abs_bf16x2, [hasPTX70, hasSM80]>; +def INT_NVVM_NEG_BF16 : F_MATH_1<"neg.bf16 \t$dst, $dst;", Int16Regs, + Int16Regs, int_nvvm_neg_bf16, [hasPTX70, hasSM80]>; +def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2 \t$dst, $dst;", Int32Regs, + Int32Regs, int_nvvm_neg_bf16x2, [hasPTX70, hasSM80]>; + // // Round // Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -145,11 +145,15 @@ Optional Special; FtzRequirementTy FtzRequirement = FTZ_Any; + // Denormal handling is guarded by different attributes depending on the + // type (denormal-fp-math vs denormal-fp-math-f32), take note of halfs. + bool IsHalfTy = false; SimplifyAction() = default; - SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) - : IID(IID), FtzRequirement(FtzReq) {} + SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq, + bool IsHalfTy = false) + : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {} // Cast operations don't have anything to do with FTZ, so we skip that // argument. @@ -197,12 +201,52 @@ return {Intrinsic::maxnum, FTZ_MustBeOff}; case Intrinsic::nvvm_fmax_ftz_f: return {Intrinsic::maxnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmax_nan_f: + return {Intrinsic::maximum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmax_ftz_nan_f: + return {Intrinsic::maximum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmax_f16: + return {Intrinsic::maxnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_f16: + return {Intrinsic::maxnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmax_f16x2: + return {Intrinsic::maxnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_f16x2: + return {Intrinsic::maxnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmax_nan_f16: + return {Intrinsic::maximum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_nan_f16: + return {Intrinsic::maximum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmax_nan_f16x2: + return {Intrinsic::maximum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmax_ftz_nan_f16x2: + return {Intrinsic::maximum, FTZ_MustBeOn, true}; case Intrinsic::nvvm_fmin_d: return {Intrinsic::minnum, FTZ_Any}; case Intrinsic::nvvm_fmin_f: return {Intrinsic::minnum, FTZ_MustBeOff}; case Intrinsic::nvvm_fmin_ftz_f: return {Intrinsic::minnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmin_nan_f: + return {Intrinsic::minimum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmin_ftz_nan_f: + return {Intrinsic::minimum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmin_f16: + return {Intrinsic::minnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_f16: + return {Intrinsic::minnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmin_f16x2: + return {Intrinsic::minnum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_f16x2: + return {Intrinsic::minnum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmin_nan_f16: + return {Intrinsic::minimum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_nan_f16: + return {Intrinsic::minimum, FTZ_MustBeOn, true}; + case Intrinsic::nvvm_fmin_nan_f16x2: + return {Intrinsic::minimum, FTZ_MustBeOff, true}; + case Intrinsic::nvvm_fmin_ftz_nan_f16x2: + return {Intrinsic::minimum, FTZ_MustBeOn, true}; case Intrinsic::nvvm_round_d: return {Intrinsic::round, FTZ_Any}; case Intrinsic::nvvm_round_f: @@ -316,9 +360,10 @@ // intrinsic, we don't have to look up any module metadata, as // FtzRequirementTy will be FTZ_Any.) if (Action.FtzRequirement != FTZ_Any) { - StringRef Attr = II->getFunction() - ->getFnAttribute("denormal-fp-math-f32") - .getValueAsString(); + const char *AttrName = + Action.IsHalfTy ? "denormal-fp-math" : "denormal-fp-math-f32"; + StringRef Attr = + II->getFunction()->getFnAttribute(AttrName).getValueAsString(); DenormalMode Mode = parseDenormalFPAttribute(Attr); bool FtzEnabled = Mode.Output != DenormalMode::IEEE; Index: llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-instcombine.ll @@ -0,0 +1,268 @@ +; RUN: opt < %s -instcombine -S -mtriple=nvptx-nvidia-cuda -march=nvptx64 \ +; RUN: -mcpu=sm_80 -mattr=+ptx70 | \ +; RUN: FileCheck %s + +declare half @llvm.nvvm.fmin.f16(half, half) +declare half @llvm.nvvm.fmin.ftz.f16(half, half) +declare <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half>, <2 x half>) +declare float @llvm.nvvm.fmin.nan.f(float, float) +declare float @llvm.nvvm.fmin.ftz.nan.f(float, float) +declare half @llvm.nvvm.fmin.nan.f16(half, half) +declare half @llvm.nvvm.fmin.ftz.nan.f16(half, half) +declare <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half>, <2 x half>) + +declare half @llvm.nvvm.fmax.f16(half, half) +declare half @llvm.nvvm.fmax.ftz.f16(half, half) +declare <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half>, <2 x half>) +declare float @llvm.nvvm.fmax.nan.f(float, float) +declare float @llvm.nvvm.fmax.ftz.nan.f(float, float) +declare half @llvm.nvvm.fmax.nan.f16(half, half) +declare half @llvm.nvvm.fmax.ftz.nan.f16(half, half) +declare <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half>, <2 x half>) + +; CHECK-LABEL: fmin_f16 +define half @fmin_f16(half %0, half %1) { + ; CHECK-NOT: @llvm.nvvm.fmin.f16 + ; CHECK: @llvm.minnum.f16 + %res = call half @llvm.nvvm.fmin.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_f16 +define half @fmin_ftz_f16(half %0, half %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmin.ftz.f16 + ; CHECK: @llvm.minnum.f16 + %res = call half @llvm.nvvm.fmin.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_f16_no_attr +define half @fmin_ftz_f16_no_attr(half %0, half %1) { + ; CHECK-NOT: @llvm.minnum.f16 + ; CHECK: @llvm.nvvm.fmin.ftz.f16 + %res = call half @llvm.nvvm.fmin.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_f16x2 +define <2 x half> @fmin_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.nvvm.fmin.f16x2 + ; CHECK: @llvm.minnum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_f16x2 +define <2 x half> @fmin_ftz_f16x2(<2 x half> %0, <2 x half> %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmin.ftz.f16x2 + ; CHECK: @llvm.minnum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_f16x2_no_attr +define <2 x half> @fmin_ftz_f16x2_no_attr(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.minnum.v2f16 + ; CHECK: @llvm.nvvm.fmin.ftz.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_nan_f +define float @fmin_nan_f(float %0, float %1) { + ; CHECK-NOT: @llvm.nvvm.fmin.nan.f + ; CHECK: @llvm.minimum.f32 + %res = call float @llvm.nvvm.fmin.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmin_ftz_nan_f +define float @fmin_ftz_nan_f(float %0, float %1) #1 { + ; CHECK-NOT: @llvm.nvvm.fmin.ftz.nan.f + ; CHECK: @llvm.minimum.f32 + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmin_ftz_nan_f_no_attr +define float @fmin_ftz_nan_f_no_attr(float %0, float %1) { + ; CHECK: @llvm.nvvm.fmin.ftz.nan.f + ; CHECK-NOT: @llvm.minimum.f32 + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmin_nan_f16 +define half @fmin_nan_f16(half %0, half %1) { + ; CHECK-NOT: @llvm.nvvm.fmin.nan.f16 + ; CHECK: @llvm.minimum.f16 + %res = call half @llvm.nvvm.fmin.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16 +define half @fmin_ftz_nan_f16(half %0, half %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmin.ftz.nan.f16 + ; CHECK: @llvm.minimum.f16 + %res = call half @llvm.nvvm.fmin.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16_no_attr +define half @fmin_ftz_nan_f16_no_attr(half %0, half %1) { + ; CHECK: @llvm.nvvm.fmin.ftz.nan.f16 + ; CHECK-NOT: @llvm.minimum.f16 + %res = call half @llvm.nvvm.fmin.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_nan_f16x2 +define <2 x half> @fmin_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.nvvm.fmin.nan.f16x2 + ; CHECK: @llvm.minimum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16x2 +define <2 x half> @fmin_ftz_nan_f16x2(<2 x half> %0, <2 x half> %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmin.ftz.nan.f16x2 + ; CHECK: @llvm.minimum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16x2_no_attr +define <2 x half> @fmin_ftz_nan_f16x2_no_attr(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.minimum.v2f16 + ; CHECK: @llvm.nvvm.fmin.ftz.nan.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_f16 +define half @fmax_f16(half %0, half %1) { + ; CHECK-NOT: @llvm.nvvm.fmax.f16 + ; CHECK: @llvm.maxnum.f16 + %res = call half @llvm.nvvm.fmax.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_f16 +define half @fmax_ftz_f16(half %0, half %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmax.ftz.f16 + ; CHECK: @llvm.maxnum.f16 + %res = call half @llvm.nvvm.fmax.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_f16_no_attr +define half @fmax_ftz_f16_no_attr(half %0, half %1) { + ; CHECK-NOT: @llvm.maxnum.f16 + ; CHECK: @llvm.nvvm.fmax.ftz.f16 + %res = call half @llvm.nvvm.fmax.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_f16x2 +define <2 x half> @fmax_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.nvvm.fmax.f16x2 + ; CHECK: @llvm.maxnum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_f16x2 +define <2 x half> @fmax_ftz_f16x2(<2 x half> %0, <2 x half> %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmax.ftz.f16x2 + ; CHECK: @llvm.maxnum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_f16x2_no_attr +define <2 x half> @fmax_ftz_f16x2_no_attr(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.maxnum.v2f16 + ; CHECK: @llvm.nvvm.fmax.ftz.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_nan_f +define float @fmax_nan_f(float %0, float %1) { + ; CHECK-NOT: @llvm.nvvm.fmax.nan.f + ; CHECK: @llvm.maximum.f32 + %res = call float @llvm.nvvm.fmax.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmax_ftz_nan_f +define float @fmax_ftz_nan_f(float %0, float %1) #1 { + ; CHECK-NOT: @llvm.nvvm.fmax.ftz.nan.f + ; CHECK: @llvm.maximum.f32 + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmax_ftz_nan_f_no_attr +define float @fmax_ftz_nan_f_no_attr(float %0, float %1) { + ; CHECK: @llvm.nvvm.fmax.ftz.nan.f + ; CHECK-NOT: @llvm.maximum.f32 + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float %0, float %1) + ret float %res +} + +; CHECK-LABEL: fmax_nan_f16 +define half @fmax_nan_f16(half %0, half %1) { + ; CHECK-NOT: @llvm.nvvm.fmax.nan.f16 + ; CHECK: @llvm.maximum.f16 + %res = call half @llvm.nvvm.fmax.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16 +define half @fmax_ftz_nan_f16(half %0, half %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmax.ftz.nan.f16 + ; CHECK: @llvm.maximum.f16 + %res = call half @llvm.nvvm.fmax.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16_no_attr +define half @fmax_ftz_nan_f16_no_attr(half %0, half %1) { + ; CHECK: @llvm.nvvm.fmax.ftz.nan.f16 + ; CHECK-NOT: @llvm.maximum.f16 + %res = call half @llvm.nvvm.fmax.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_nan_f16x2 +define <2 x half> @fmax_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.nvvm.fmax.nan.f16x2 + ; CHECK: @llvm.maximum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16x2 +define <2 x half> @fmax_ftz_nan_f16x2(<2 x half> %0, <2 x half> %1) #0 { + ; CHECK-NOT: @llvm.nvvm.fmax.ftz.nan.f16x2 + ; CHECK: @llvm.maximum.v2f16 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16x2_no_attr +define <2 x half> @fmax_ftz_nan_f16x2_no_attr(<2 x half> %0, <2 x half> %1) { + ; CHECK-NOT: @llvm.maximum.v2f16 + ; CHECK: @llvm.nvvm.fmax.ftz.nan.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +attributes #0 = { "denormal-fp-math"="preserve-sign" } +attributes #1 = { "denormal-fp-math-f32"="preserve-sign" } Index: llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70.ll @@ -0,0 +1,260 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s + +declare i16 @llvm.nvvm.abs.bf16(i16) +declare i32 @llvm.nvvm.abs.bf16x2(i32) +declare i16 @llvm.nvvm.neg.bf16(i16) +declare i32 @llvm.nvvm.neg.bf16x2(i32) + +declare float @llvm.nvvm.fmin.nan.f(float, float) +declare float @llvm.nvvm.fmin.ftz.nan.f(float, float) +declare half @llvm.nvvm.fmin.f16(half, half) +declare half @llvm.nvvm.fmin.ftz.f16(half, half) +declare half @llvm.nvvm.fmin.nan.f16(half, half) +declare half @llvm.nvvm.fmin.ftz.nan.f16(half, half) +declare <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half>, <2 x half>) +declare i16 @llvm.nvvm.fmin.bf16(i16, i16) +declare i16 @llvm.nvvm.fmin.nan.bf16(i16, i16) +declare i32 @llvm.nvvm.fmin.bf16x2(i32, i32) +declare i32 @llvm.nvvm.fmin.nan.bf16x2(i32, i32) + +declare float @llvm.nvvm.fmax.nan.f(float, float) +declare float @llvm.nvvm.fmax.ftz.nan.f(float, float) +declare half @llvm.nvvm.fmax.f16(half, half) +declare half @llvm.nvvm.fmax.ftz.f16(half, half) +declare half @llvm.nvvm.fmax.nan.f16(half, half) +declare half @llvm.nvvm.fmax.ftz.nan.f16(half, half) +declare <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half>, <2 x half>) +declare <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half>, <2 x half>) +declare i16 @llvm.nvvm.fmax.bf16(i16, i16) +declare i16 @llvm.nvvm.fmax.nan.bf16(i16, i16) +declare i32 @llvm.nvvm.fmax.bf16x2(i32, i32) +declare i32 @llvm.nvvm.fmax.nan.bf16x2(i32, i32) + +; CHECK-LABEL: abs_bf16 +define i16 @abs_bf16(i16 %0) { + ; CHECK: abs.bf16 + %res = call i16 @llvm.nvvm.abs.bf16(i16 %0); + ret i16 %res +} + +; CHECK-LABEL: abs_bf16x2 +define i32 @abs_bf16x2(i32 %0) { + ; CHECK: abs.bf16x2 + %res = call i32 @llvm.nvvm.abs.bf16x2(i32 %0); + ret i32 %res +} + +; CHECK-LABEL: neg_bf16 +define i16 @neg_bf16(i16 %0) { + ; CHECK: neg.bf16 + %res = call i16 @llvm.nvvm.neg.bf16(i16 %0); + ret i16 %res +} + +; CHECK-LABEL: neg_bf16x2 +define i32 @neg_bf16x2(i32 %0) { + ; CHECK: neg.bf16x2 + %res = call i32 @llvm.nvvm.neg.bf16x2(i32 %0); + ret i32 %res +} + +; CHECK-LABEL: fmin_nan_f +define float @fmin_nan_f(float %0, float %1) { + ; CHECK: min.NaN.f32 + %res = call float @llvm.nvvm.fmin.nan.f(float %0, float %1); + ret float %res +} + +; CHECK-LABEL: fmin_ftz_nan_f +define float @fmin_ftz_nan_f(float %0, float %1) { + ; CHECK: min.ftz.NaN.f32 + %res = call float @llvm.nvvm.fmin.ftz.nan.f(float %0, float %1); + ret float %res +} + +; CHECK-LABEL: fmin_f16 +define half @fmin_f16(half %0, half %1) { + ; CHECK: min.f16 + %res = call half @llvm.nvvm.fmin.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_f16 +define half @fmin_ftz_f16(half %0, half %1) { + ; CHECK: min.ftz.f16 + %res = call half @llvm.nvvm.fmin.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_nan_f16 +define half @fmin_nan_f16(half %0, half %1) { + ; CHECK: min.NaN.f16 + %res = call half @llvm.nvvm.fmin.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16 +define half @fmin_ftz_nan_f16(half %0, half %1) { + ; CHECK: min.ftz.NaN.f16 + %res = call half @llvm.nvvm.fmin.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmin_f16x2 +define <2 x half> @fmin_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_f16x2 +define <2 x half> @fmin_ftz_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.ftz.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_nan_f16x2 +define <2 x half> @fmin_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.NaN.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_ftz_nan_f16x2 +define <2 x half> @fmin_ftz_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: min.ftz.NaN.f16x2 + %res = call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmin_bf16 +define i16 @fmin_bf16(i16 %0, i16 %1) { + ; CHECK: min.bf16 + %res = call i16 @llvm.nvvm.fmin.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmin_nan_bf16 +define i16 @fmin_nan_bf16(i16 %0, i16 %1) { + ; CHECK: min.NaN.bf16 + %res = call i16 @llvm.nvvm.fmin.nan.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmin_bf16x2 +define i32 @fmin_bf16x2(i32 %0, i32 %1) { + ; CHECK: min.bf16x2 + %res = call i32 @llvm.nvvm.fmin.bf16x2(i32 %0, i32 %1) + ret i32 %res +} + +; CHECK-LABEL: fmin_nan_bf16x2 +define i32 @fmin_nan_bf16x2(i32 %0, i32 %1) { + ; CHECK: min.NaN.bf16x2 + %res = call i32 @llvm.nvvm.fmin.nan.bf16x2(i32 %0, i32 %1) + ret i32 %res +} + +; CHECK-LABEL: fmax_nan_f +define float @fmax_nan_f(float %0, float %1) { + ; CHECK: max.NaN.f32 + %res = call float @llvm.nvvm.fmax.nan.f(float %0, float %1); + ret float %res +} + +; CHECK-LABEL: fmax_ftz_nan_f +define float @fmax_ftz_nan_f(float %0, float %1) { + ; CHECK: max.ftz.NaN.f32 + %res = call float @llvm.nvvm.fmax.ftz.nan.f(float %0, float %1); + ret float %res +} + +; CHECK-LABEL: fmax_f16 +define half @fmax_f16(half %0, half %1) { + ; CHECK: max.f16 + %res = call half @llvm.nvvm.fmax.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_f16 +define half @fmax_ftz_f16(half %0, half %1) { + ; CHECK: max.ftz.f16 + %res = call half @llvm.nvvm.fmax.ftz.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_nan_f16 +define half @fmax_nan_f16(half %0, half %1) { + ; CHECK: max.NaN.f16 + %res = call half @llvm.nvvm.fmax.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16 +define half @fmax_ftz_nan_f16(half %0, half %1) { + ; CHECK: max.ftz.NaN.f16 + %res = call half @llvm.nvvm.fmax.ftz.nan.f16(half %0, half %1) + ret half %res +} + +; CHECK-LABEL: fmax_f16x2 +define <2 x half> @fmax_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_f16x2 +define <2 x half> @fmax_ftz_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.ftz.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_nan_f16x2 +define <2 x half> @fmax_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.NaN.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_ftz_nan_f16x2 +define <2 x half> @fmax_ftz_nan_f16x2(<2 x half> %0, <2 x half> %1) { + ; CHECK: max.ftz.NaN.f16x2 + %res = call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2(<2 x half> %0, <2 x half> %1) + ret <2 x half> %res +} + +; CHECK-LABEL: fmax_bf16 +define i16 @fmax_bf16(i16 %0, i16 %1) { + ; CHECK: max.bf16 + %res = call i16 @llvm.nvvm.fmax.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmax_nan_bf16 +define i16 @fmax_nan_bf16(i16 %0, i16 %1) { + ; CHECK: max.NaN.bf16 + %res = call i16 @llvm.nvvm.fmax.nan.bf16(i16 %0, i16 %1) + ret i16 %res +} + +; CHECK-LABEL: fmax_bf16x2 +define i32 @fmax_bf16x2(i32 %0, i32 %1) { + ; CHECK: max.bf16x2 + %res = call i32 @llvm.nvvm.fmax.bf16x2(i32 %0, i32 %1) + ret i32 %res +} + +; CHECK-LABEL: fmax_nan_bf16x2 +define i32 @fmax_nan_bf16x2(i32 %0, i32 %1) { + ; CHECK: max.NaN.bf16x2 + %res = call i32 @llvm.nvvm.fmax.nan.bf16x2(i32 %0, i32 %1) + ret i32 %res +}