Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -133,6 +133,9 @@ bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3162,6 +3162,55 @@ return true; } +// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. +// FIXME: Why do we handle this one but not other removed instructions? +// +// Reciprocal square root. The clamp prevents infinite results, clamping +// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to +// +-max_float. +bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) + return true; + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(2).getReg(); + auto Flags = MI.getFlags(); + + LLT Ty = MRI.getType(Dst); + + const fltSemantics *FltSemantics; + if (Ty == LLT::scalar(32)) + FltSemantics = &APFloat::IEEEsingle(); + else if (Ty == LLT::scalar(64)) + FltSemantics = &APFloat::IEEEdouble(); + else + return false; + + auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) + .addUse(Src) + .setMIFlags(Flags); + + // We don't need to concern ourselves with the snan handling difference, since + // the rsq quieted (or not) so use the one which will directly select. 
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + const bool UseIEEE = MFI->getMode().IEEE; + + auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); + auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : + B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); + + auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); + + if (UseIEEE) + B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); + else + B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -4398,6 +4447,8 @@ return legalizeTrapIntrinsic(MI, MRI, B); case Intrinsic::debugtrap: return legalizeDebugTrapIntrinsic(MI, MRI, B); + case Intrinsic::amdgcn_rsq_clamp: + return legalizeRsqClampIntrinsic(MI, MRI, B); default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.rsq.clamp.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.rsq.clamp.mir @@ -0,0 +1,63 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s + +--- +name: test_rsq_clamp_flags_ieee_on_f32 +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: true + +body: | + bb.0: + liveins: $vgpr0 + + ; SI-LABEL: name: test_rsq_clamp_flags_ieee_on_f32 + ; SI: liveins: $vgpr0 + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; SI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), [[COPY]](s32) + ; SI: $vgpr0 = COPY [[INT]](s32) + ; VI-LABEL: name: 
test_rsq_clamp_flags_ieee_on_f32 + ; VI: liveins: $vgpr0 + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; VI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x47EFFFFFE0000000 + ; VI: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMINNUM_IEEE [[INT]], [[C]] + ; VI: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC7EFFFFFE0000000 + ; VI: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMAXNUM_IEEE [[FMINNUM_IEEE]], [[C1]] + ; VI: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0 + $vgpr0 = COPY %1 +... + +--- +name: test_rsq_clamp_flags_ieee_off_f32 +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + +body: | + bb.0: + liveins: $vgpr0 + + ; SI-LABEL: name: test_rsq_clamp_flags_ieee_off_f32 + ; SI: liveins: $vgpr0 + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; SI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), [[COPY]](s32) + ; SI: $vgpr0 = COPY [[INT]](s32) + ; VI-LABEL: name: test_rsq_clamp_flags_ieee_off_f32 + ; VI: liveins: $vgpr0 + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; VI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x47EFFFFFE0000000 + ; VI: [[FMINNUM:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMINNUM [[INT]], [[C]] + ; VI: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC7EFFFFFE0000000 + ; VI: [[FMAXNUM:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMAXNUM [[FMINNUM]], [[C1]] + ; VI: $vgpr0 = COPY [[FMAXNUM]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0 + $vgpr0 = COPY %1 +... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll @@ -0,0 +1,172 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s + +define float @v_rsq_clamp_f32(float %src) #0 { +; SI-LABEL: v_rsq_clamp_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f32_e32 v0, v0 +; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0 +; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src) + ret float %rsq_clamp +} + +define float @v_rsq_clamp_fabs_f32(float %src) #0 { +; SI-LABEL: v_rsq_clamp_fabs_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f32_e64 v0, |v0| +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_fabs_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f32_e64 v0, |v0| +; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0 +; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %fabs.src = call float @llvm.fabs.f32(float %src) + %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %fabs.src) + ret float %rsq_clamp +} + +define double @v_rsq_clamp_f64(double %src) #0 { +; SI-LABEL: v_rsq_clamp_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
v_rsq_clamp_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; VI-NEXT: s_mov_b32 s4, -1 +; VI-NEXT: s_mov_b32 s5, 0x7fefffff +; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s5, 0xffefffff +; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src) + ret double %rsq_clamp +} + +define double @v_rsq_clamp_fabs_f64(double %src) #0 { +; SI-LABEL: v_rsq_clamp_fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f64_e64 v[0:1], |v[0:1]| +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_fabs_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]| +; VI-NEXT: s_mov_b32 s4, -1 +; VI-NEXT: s_mov_b32 s5, 0x7fefffff +; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s5, 0xffefffff +; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] + %fabs.src = call double @llvm.fabs.f64(double %src) + %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src) + ret double %rsq_clamp +} + +define float @v_rsq_clamp_undef_f32() #0 { +; SI-LABEL: v_rsq_clamp_undef_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f32_e32 v0, s4 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_undef_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f32_e32 v0, s4 +; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0 +; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef) + ret float %rsq_clamp +} + +define double @v_rsq_clamp_undef_f64() #0 { +; SI-LABEL: v_rsq_clamp_undef_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], s[4:5] +; 
SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_undef_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f64_e32 v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s4, -1 +; VI-NEXT: s_mov_b32 s5, 0x7fefffff +; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s5, 0xffefffff +; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef) + ret double %rsq_clamp +} + +define float @v_rsq_clamp_f32_non_ieee(float %src) #2 { +; SI-LABEL: v_rsq_clamp_f32_non_ieee: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_f32_non_ieee: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f32_e32 v0, v0 +; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0 +; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src) + ret float %rsq_clamp +} + +define double @v_rsq_clamp_f64_non_ieee(double %src) #2 { +; SI-LABEL: v_rsq_clamp_f64_non_ieee: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_f64_non_ieee: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; VI-NEXT: s_mov_b32 s4, -1 +; VI-NEXT: s_mov_b32 s5, 0x7fefffff +; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s5, 0xffefffff +; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src) + ret double %rsq_clamp +} + + +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.amdgcn.rsq.clamp.f32(float) #1 +declare double @llvm.fabs.f64(double) #1 +declare double 
@llvm.amdgcn.rsq.clamp.f64(double) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "amdgpu-ieee"="false" } +