Index: clang/include/clang/Basic/BuiltinsAMDGPU.def
===================================================================
--- clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -101,6 +101,7 @@
 BUILTIN(__builtin_amdgcn_sinf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_cosf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_logf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_exp2f, "ff", "nc")
 BUILTIN(__builtin_amdgcn_log_clampf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_ldexp, "ddi", "nc")
 BUILTIN(__builtin_amdgcn_ldexpf, "ffi", "nc")
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -17173,6 +17173,8 @@
     return EmitAMDGPUDispatchPtr(*this, E);
   case AMDGPU::BI__builtin_amdgcn_logf:
     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log);
+  case AMDGPU::BI__builtin_amdgcn_exp2f:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_exp2);
   case AMDGPU::BI__builtin_amdgcn_log_clampf:
     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
   case AMDGPU::BI__builtin_amdgcn_ldexp:
Index: clang/test/CodeGenOpenCL/builtins-amdgcn.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -179,6 +179,13 @@
   *out = __builtin_amdgcn_logf(a);
 }
 
+// CHECK-LABEL: @test_exp2_f32
+// CHECK: call float @llvm.amdgcn.exp2.f32
+void test_exp2_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_exp2f(a);
+}
+
 // CHECK-LABEL: @test_log_clamp_f32
 // CHECK: call float @llvm.amdgcn.log.clamp.f32
 void test_log_clamp_f32(global float* out, float a)
Index: llvm/docs/AMDGPUUsage.rst
===================================================================
--- llvm/docs/AMDGPUUsage.rst
+++ llvm/docs/AMDGPUUsage.rst
@@ -972,6 +972,9 @@
 llvm.amdgcn.log                           Provides direct access to v_log_f32 and v_log_f16
                                           (on targets with half support). Peforms log2 function.
 
+llvm.amdgcn.exp2                          Provides direct access to v_exp_f32 and v_exp_f16
+                                          (on targets with half support). Performs exp2 function.
+
 ========================================= ==========================================================
 
 
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -308,6 +308,15 @@
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
 
+// v_exp_{f16|f32} (int_amdgcn_exp was taken by export
+// already). Performs exp2. f32 version does not handle
+// denormals. There is no reason to use this for f16 as it does
+// support denormals, and the generic exp2 intrinsic should be
+// preferred.
+def int_amdgcn_exp2 : DefaultAttrsIntrinsic<
+  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
 def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -459,6 +459,9 @@
   // log2, no denormal handling for f32.
   LOG,
 
+  // exp2, no denormal handling for f32.
+  EXP,
+
   FMUL_LEGACY,
   RSQ_CLAMP,
   FP_CLASS,
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4932,6 +4932,7 @@
   NODE_NAME_CASE(RCP_LEGACY)
   NODE_NAME_CASE(RCP_IFLAG)
   NODE_NAME_CASE(LOG)
+  NODE_NAME_CASE(EXP)
   NODE_NAME_CASE(FMUL_LEGACY)
   NODE_NAME_CASE(RSQ_CLAMP)
   NODE_NAME_CASE(FP_CLASS)
Index: llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -118,6 +118,9 @@
 // v_log_f32, which is log2
 def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>;
 
+// v_exp_f32, which is exp2
+def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>;
+
 // out = 1.0 / sqrt(a)
 def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
 
@@ -393,6 +396,11 @@
 def AMDGPUlogf16 : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
                                               (flog2 node:$src)]>;
 
+def AMDGPUexp : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src),
+                                           (AMDGPUexp_impl node:$src),
+                                           (fexp2 node:$src)]>; // FIXME: Remove me
+def AMDGPUexpf16 : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src),
+                                              (fexp2 node:$src)]>;
 
 def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1),
   [(int_amdgcn_class node:$src0, node:$src1),
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4208,6 +4208,7 @@
   case Intrinsic::amdgcn_cos:
   case Intrinsic::amdgcn_log_clamp:
   case Intrinsic::amdgcn_log:
+  case Intrinsic::amdgcn_exp2:
   case Intrinsic::amdgcn_rcp:
   case Intrinsic::amdgcn_rcp_legacy:
   case Intrinsic::amdgcn_sqrt:
Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -321,7 +321,7 @@
 defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
 
 let TRANS = 1, SchedRW = [WriteTrans32] in {
-defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
+defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, AMDGPUexp>;
 defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, AMDGPUlog>;
 defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
 defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
@@ -488,7 +488,7 @@
 defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
 defm V_RSQ_F16 : VOP1Inst_t16 <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
 defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>;
-defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, fexp2>;
+defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>;
 defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
 defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
 } // End TRANS = 1, SchedRW = [WriteTrans32]
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define float @v_exp2_f32(float %src) {
+; GCN-LABEL: v_exp2_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %exp2 = call float @llvm.amdgcn.exp2.f32(float %src)
+  ret float %exp2
+}
+
+define float @v_fabs_exp2_f32(float %src) {
+; GCN-LABEL: v_fabs_exp2_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f32_e64 v0, |v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call float @llvm.fabs.f32(float %src)
+  %exp2 = call float @llvm.amdgcn.exp2.f32(float %fabs.src)
+  ret float %exp2
+}
+
+define float @v_fneg_fabs_exp2_f32(float %src) {
+; GCN-LABEL: v_fneg_fabs_exp2_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f32_e64 v0, -|v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call float @llvm.fabs.f32(float %src)
+  %neg.fabs.src = fneg float %fabs.src
+  %exp2 = call float @llvm.amdgcn.exp2.f32(float %neg.fabs.src)
+  ret float %exp2
+}
+
+define half @v_exp2_f16(half %src) {
+; GCN-LABEL: v_exp2_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f16_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %exp2 = call half @llvm.amdgcn.exp2.f16(half %src)
+  ret half %exp2
+}
+
+define half @v_fabs_exp2_f16(half %src) {
+; GCN-LABEL: v_fabs_exp2_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f16_e64 v0, |v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call half @llvm.fabs.f16(half %src)
+  %exp2 = call half @llvm.amdgcn.exp2.f16(half %fabs.src)
+  ret half %exp2
+}
+
+define half @v_fneg_fabs_exp2_f16(half %src) {
+; GCN-LABEL: v_fneg_fabs_exp2_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_exp_f16_e64 v0, -|v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call half @llvm.fabs.f16(half %src)
+  %neg.fabs.src = fneg half %fabs.src
+  %exp2 = call half @llvm.amdgcn.exp2.f16(half %neg.fabs.src)
+  ret half %exp2
+}
+
+declare half @llvm.amdgcn.exp2.f16(half) #0
+declare float @llvm.amdgcn.exp2.f32(float) #0
+declare float @llvm.fabs.f32(float) #0
+declare half @llvm.fabs.f16(half) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL: {{.*}}
+; SDAG: {{.*}}
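
For reference, a minimal IR sketch (not part of the patch; function names are illustrative) contrasting the generic exp2 intrinsic with the new target intrinsic. Both compute 2^x, but llvm.amdgcn.exp2 is a direct request for v_exp_f32 via the AMDGPUexp pattern added above, and so inherits its lack of f32 denormal handling; the intent recorded in the FIXME is that generic fexp2 eventually stops selecting to v_exp_f32 and instead gets a denormal-correct lowering.

; Illustrative only -- assumes only the intrinsics added/used by this patch
; plus the pre-existing generic llvm.exp2 intrinsic.
define float @use_generic_exp2(float %x) {
  %r = call float @llvm.exp2.f32(float %x)        ; generic form; denormal-correct lowering allowed
  ret float %r
}

define float @use_amdgcn_exp2(float %x) {
  %r = call float @llvm.amdgcn.exp2.f32(float %x) ; raw v_exp_f32, no f32 denormal handling
  ret float %r
}

declare float @llvm.exp2.f32(float)
declare float @llvm.amdgcn.exp2.f32(float)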