Index: llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1280,6 +1280,30 @@
     return buildInstr(TargetOpcode::G_FMUL, {Dst}, {Src0, Src1}, Flags);
   }
 
+  MachineInstrBuilder buildFMinNum(const DstOp &Dst, const SrcOp &Src0,
+                                   const SrcOp &Src1,
+                                   Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_FMINNUM, {Dst}, {Src0, Src1}, Flags);
+  }
+
+  MachineInstrBuilder buildFMaxNum(const DstOp &Dst, const SrcOp &Src0,
+                                   const SrcOp &Src1,
+                                   Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_FMAXNUM, {Dst}, {Src0, Src1}, Flags);
+  }
+
+  MachineInstrBuilder buildFMinNumIEEE(const DstOp &Dst, const SrcOp &Src0,
+                                       const SrcOp &Src1,
+                                       Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_FMINNUM_IEEE, {Dst}, {Src0, Src1}, Flags);
+  }
+
+  MachineInstrBuilder buildFMaxNumIEEE(const DstOp &Dst, const SrcOp &Src0,
+                                       const SrcOp &Src1,
+                                       Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_FMAXNUM_IEEE, {Dst}, {Src0, Src1}, Flags);
+  }
+
   MachineInstrBuilder buildShl(const DstOp &Dst, const SrcOp &Src0,
                                const SrcOp &Src1,
                                Optional<unsigned> Flags = None) {
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -75,6 +75,9 @@
   bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineIRBuilder &B) const;
 
+  bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
+                      MachineIRBuilder &B) const;
+
   Register getLiveInRegister(MachineRegisterInfo &MRI, Register Reg,
                              LLT Ty) const;
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -400,10 +400,24 @@
       .scalarize(0)
       .clampScalar(0, S16, S64);
   } else {
-    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
+
+    getActionDefinitionsBuilder(G_FSQRT)
       .legalFor({S32, S64})
       .scalarize(0)
       .clampScalar(0, S32, S64);
+
+    if (ST.hasFractBug()) {
+      getActionDefinitionsBuilder(G_FFLOOR)
+        .customFor({S64})
+        .legalFor({S32, S64})
+        .scalarize(0)
+        .clampScalar(0, S32, S64);
+    } else {
+      getActionDefinitionsBuilder(G_FFLOOR)
+        .legalFor({S32, S64})
+        .scalarize(0)
+        .clampScalar(0, S32, S64);
+    }
   }
 
   getActionDefinitionsBuilder(G_FPTRUNC)
@@ -1165,6 +1179,8 @@
     return legalizeFDIV(MI, MRI, B);
   case TargetOpcode::G_ATOMIC_CMPXCHG:
     return legalizeAtomicCmpXChg(MI, MRI, B);
+  case TargetOpcode::G_FFLOOR:
+    return legalizeFFloor(MI, MRI, B);
   default:
     return false;
   }
@@ -1798,6 +1814,75 @@
   return true;
 }
 
+// Find a source register, ignoring any possible source modifiers.
+static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
+  Register ModSrc = OrigSrc;
+  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
+    ModSrc = SrcFNeg->getOperand(1).getReg();
+    if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
+      ModSrc = SrcFAbs->getOperand(1).getReg();
+  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
+    ModSrc = SrcFAbs->getOperand(1).getReg();
+  return ModSrc;
+}
+
+bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
+                                         MachineRegisterInfo &MRI,
+                                         MachineIRBuilder &B) const {
+  B.setInstr(MI);
+
+  const LLT S1 = LLT::scalar(1);
+  const LLT S64 = LLT::scalar(64);
+  Register Dst = MI.getOperand(0).getReg();
+  Register OrigSrc = MI.getOperand(1).getReg();
+  unsigned Flags = MI.getFlags();
+  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
+         "this should not have been custom lowered");
+
+  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
+  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
+  // efficient way to implement it is using V_FRACT_F64. The workaround for the
+  // V_FRACT bug is:
+  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
+  //
+  // Convert floor(x) to (x - fract(x))
+
+  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
+    .addUse(OrigSrc)
+    .setMIFlags(Flags);
+
+  // Give source modifier matching some assistance before obscuring a foldable
+  // pattern.
+
+  // TODO: We can avoid the neg on the fract? The input sign to fract
+  // shouldn't matter?
+  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
+
+  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
+
+  Register Min = MRI.createGenericVirtualRegister(S64);
+
+  // We don't need to concern ourselves with the snan handling difference, so
+  // use the one which will directly select.
+  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+  if (MFI->getMode().IEEE)
+    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
+  else
+    B.buildFMinNum(Min, Fract, Const, Flags);
+
+  Register CorrectedFract = Min;
+  if (!MI.getFlag(MachineInstr::FmNoNans)) {
+    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
+    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
+  }
+
+  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
+  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 // Return the use branch instruction, otherwise null if the usage is invalid.
 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -506,6 +506,10 @@
     return getGeneration() >= VOLCANIC_ISLANDS;
   }
 
+  bool hasFractBug() const {
+    return getGeneration() == SOUTHERN_ISLANDS;
+  }
+
   bool hasBFE() const {
     return true;
   }
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1986,6 +1986,11 @@
 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
 // Convert floor(x) to (x - fract(x))
+
+// Don't bother handling this for GlobalISel, it's handled during
+// lowering.
+//
+// FIXME: DAG should also custom lower this.
 def : GCNPat <
   (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
   (V_ADD_F64
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
@@ -0,0 +1,298 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefix=GFX78 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=GFX78 %s
+
+define double @v_floor_f64_ieee(double %x) {
+; GFX6-LABEL: v_floor_f64_ieee:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX78-LABEL: v_floor_f64_ieee:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX78-NEXT:    v_floor_f64_e32 v[0:1], v[0:1]
+; GFX78-NEXT:    s_setpc_b64 s[30:31]
+  %result = call double @llvm.floor.f64(double %x)
+  ret double %result
+}
+
+define double @v_floor_f64_ieee_nnan(double %x) {
+; GFX6-LABEL: v_floor_f64_ieee_nnan:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX78-LABEL: v_floor_f64_ieee_nnan:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX78-NEXT:    v_floor_f64_e32 v[0:1], v[0:1]
+; GFX78-NEXT:    s_setpc_b64 s[30:31]
+  %result = call nnan double @llvm.floor.f64(double %x)
+  ret double %result
+}
+
+define double @v_floor_f64_ieee_fneg(double %x) {
+; GFX6-LABEL: v_floor_f64_ieee_fneg:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], -v[0:1], -v[2:3]
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX78-LABEL: v_floor_f64_ieee_fneg:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX78-NEXT:    v_floor_f64_e64 v[0:1], -v[0:1]
+; GFX78-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = fneg double %x
+  %result = call double @llvm.floor.f64(double %neg.x)
+  ret double %result
+}
+
+define double @v_floor_f64_nonieee(double %x) #1 {
+; GFX6-LABEL: v_floor_f64_nonieee:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX78-LABEL: v_floor_f64_nonieee:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX78-NEXT:    v_floor_f64_e32 v[0:1], v[0:1]
+; GFX78-NEXT:    s_setpc_b64 s[30:31]
+  %result = call double @llvm.floor.f64(double %x)
+  ret double %result
+}
+
+define double @v_floor_f64_nonieee_nnan(double %x) #1 {
+; GFX6-LABEL: v_floor_f64_nonieee_nnan:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX78-LABEL: v_floor_f64_nonieee_nnan:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX78-NEXT:    v_floor_f64_e32 v[0:1], v[0:1]
+; GFX78-NEXT:    s_setpc_b64 s[30:31]
+  %result = call nnan double @llvm.floor.f64(double %x)
+  ret double %result
+}
+
+define double @v_floor_f64_non_ieee_fneg(double %x) #1 {
+; GFX6-LABEL: v_floor_f64_non_ieee_fneg:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -v[0:1]
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], -v[0:1], -v[2:3]
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX78-LABEL: v_floor_f64_non_ieee_fneg:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX78-NEXT:    v_floor_f64_e64 v[0:1], -v[0:1]
+; GFX78-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = fneg double %x
+  %result = call double @llvm.floor.f64(double %neg.x)
+  ret double %result
+}
+
+define double @v_floor_f64_fabs(double %x) {
+; GFX6-LABEL: v_floor_f64_fabs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_fract_f64_e64 v[2:3], |v[0:1]|
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], |v[0:1]|, -v[2:3]
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX78-LABEL: v_floor_f64_fabs:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX78-NEXT:    v_floor_f64_e64 v[0:1], |v[0:1]|
+; GFX78-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_floor_f64_fabs:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_floor_f64_e64 v[0:1], |v[0:1]|
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %abs.x = call double @llvm.fabs.f64(double %x)
+  %result = call double @llvm.floor.f64(double %abs.x)
+  ret double %result
+}
+
+define double @v_floor_f64_fneg_fabs(double %x) {
+; GFX6-LABEL: v_floor_f64_fneg_fabs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -|v[0:1]|
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], -|v[0:1]|, -v[2:3]
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX78-LABEL: v_floor_f64_fneg_fabs:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX78-NEXT:    v_floor_f64_e64 v[0:1], -|v[0:1]|
+; GFX78-NEXT:    s_setpc_b64 s[30:31]
+  %abs.x = call double @llvm.fabs.f64(double %x)
+  %neg.abs.x = fneg double %abs.x
+  %result = call double @llvm.floor.f64(double %neg.abs.x)
+  ret double %result
+}
+
+define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
+; GFX6-LABEL: s_floor_f64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_fract_f64_e32 v[0:1], s[2:3]
+; GFX6-NEXT:    s_mov_b32 s0, -1
+; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT:    v_cmp_u_f64_e64 vcc, s[2:3], s[2:3]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], s[2:3], -v[0:1]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX78-LABEL: s_floor_f64:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    v_floor_f64_e32 v[0:1], s[2:3]
+; GFX78-NEXT:    ; return to shader part epilog
+  %result = call double @llvm.floor.f64(double %x)
+  %cast = bitcast double %result to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
+; GFX6-LABEL: s_floor_f64_fneg:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_fract_f64_e64 v[0:1], -s[2:3]
+; GFX6-NEXT:    s_mov_b32 s0, -1
+; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT:    v_cmp_u_f64_e64 vcc, s[2:3], s[2:3]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], -s[2:3], -v[0:1]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX78-LABEL: s_floor_f64_fneg:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    v_floor_f64_e64 v[0:1], -s[2:3]
+; GFX78-NEXT:    ; return to shader part epilog
+  %neg.x = fneg double %x
+  %result = call double @llvm.floor.f64(double %neg.x)
+  %cast = bitcast double %result to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
+; GFX6-LABEL: s_floor_f64_fabs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_fract_f64_e64 v[0:1], |s[2:3]|
+; GFX6-NEXT:    s_mov_b32 s0, -1
+; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT:    v_cmp_u_f64_e64 vcc, s[2:3], s[2:3]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], |s[2:3]|, -v[0:1]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX78-LABEL: s_floor_f64_fabs:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    v_floor_f64_e64 v[0:1], |s[2:3]|
+; GFX78-NEXT:    ; return to shader part epilog
+  %abs.x = call double @llvm.fabs.f64(double %x)
+  %result = call double @llvm.floor.f64(double %abs.x)
+  %cast = bitcast double %result to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) {
+; GFX6-LABEL: s_floor_f64_fneg_fabs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_fract_f64_e64 v[0:1], -|s[2:3]|
+; GFX6-NEXT:    s_mov_b32 s0, -1
+; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT:    v_cmp_u_f64_e64 vcc, s[2:3], s[2:3]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], -|s[2:3]|, -v[0:1]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX78-LABEL: s_floor_f64_fneg_fabs:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    v_floor_f64_e64 v[0:1], -|s[2:3]|
+; GFX78-NEXT:    ; return to shader part epilog
+  %abs.x = call double @llvm.fabs.f64(double %x)
+  %neg.abs.x = fneg double %abs.x
+  %result = call double @llvm.floor.f64(double %neg.abs.x)
+  %cast = bitcast double %result to <2 x float>
+  ret <2 x float> %cast
+}
+
+declare double @llvm.floor.f64(double) #0
+declare double @llvm.fabs.f64(double) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
+attributes #1 = { "amdgpu-ieee"="false" }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir
@@ -34,8 +34,14 @@
     ; SI-LABEL: name: test_ffloor_s64
     ; SI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
-    ; SI: [[FFLOOR:%[0-9]+]]:_(s64) = G_FFLOOR [[COPY]]
-    ; SI: $vgpr0_vgpr1 = COPY [[FFLOOR]](s64)
+    ; SI: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[COPY]](s64)
+    ; SI: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF
+    ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT]], [[C]]
+    ; SI: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(uno), [[COPY]](s64), [[COPY]]
+    ; SI: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[COPY]], [[FMINNUM_IEEE]]
+    ; SI: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[SELECT]]
+    ; SI: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[COPY]], [[FNEG]]
+    ; SI: $vgpr0_vgpr1 = COPY [[FADD]](s64)
    ; VI-LABEL: name: test_ffloor_s64
    ; VI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
    ; VI: [[FFLOOR:%[0-9]+]]:_(s64) = G_FFLOOR [[COPY]]
@@ -49,6 +55,65 @@
    $vgpr0_vgpr1 = COPY %1
 
 ...
+
+---
+name: test_ffloor_s64_nnan
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; SI-LABEL: name: test_ffloor_s64_nnan
+    ; SI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; SI: [[INT:%[0-9]+]]:_(s64) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[COPY]](s64)
+    ; SI: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF
+    ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = nnan G_FMINNUM_IEEE [[INT]], [[C]]
+    ; SI: [[FNEG:%[0-9]+]]:_(s64) = nnan G_FNEG [[FMINNUM_IEEE]]
+    ; SI: [[FADD:%[0-9]+]]:_(s64) = nnan G_FADD [[COPY]], [[FNEG]]
+    ; SI: $vgpr0_vgpr1 = COPY [[FADD]](s64)
+    ; VI-LABEL: name: test_ffloor_s64_nnan
+    ; VI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; VI: [[FFLOOR:%[0-9]+]]:_(s64) = nnan G_FFLOOR [[COPY]]
+    ; VI: $vgpr0_vgpr1 = COPY [[FFLOOR]](s64)
+    ; GFX9-LABEL: name: test_ffloor_s64_nnan
+    ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GFX9: [[FFLOOR:%[0-9]+]]:_(s64) = nnan G_FFLOOR [[COPY]]
+    ; GFX9: $vgpr0_vgpr1 = COPY [[FFLOOR]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = nnan G_FFLOOR %0
+    $vgpr0_vgpr1 = COPY %1
+
+...
+
+---
+name: test_ffloor_s64_nssaz
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; SI-LABEL: name: test_ffloor_s64_nssaz
+    ; SI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; SI: [[INT:%[0-9]+]]:_(s64) = nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[COPY]](s64)
+    ; SI: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF
+    ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = nsz G_FMINNUM_IEEE [[INT]], [[C]]
+    ; SI: [[FCMP:%[0-9]+]]:_(s1) = nsz G_FCMP floatpred(uno), [[COPY]](s64), [[COPY]]
+    ; SI: [[SELECT:%[0-9]+]]:_(s64) = nsz G_SELECT [[FCMP]](s1), [[COPY]], [[FMINNUM_IEEE]]
+    ; SI: [[FNEG:%[0-9]+]]:_(s64) = nsz G_FNEG [[SELECT]]
+    ; SI: [[FADD:%[0-9]+]]:_(s64) = nsz G_FADD [[COPY]], [[FNEG]]
+    ; SI: $vgpr0_vgpr1 = COPY [[FADD]](s64)
+    ; VI-LABEL: name: test_ffloor_s64_nssaz
+    ; VI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; VI: [[FFLOOR:%[0-9]+]]:_(s64) = nsz G_FFLOOR [[COPY]]
+    ; VI: $vgpr0_vgpr1 = COPY [[FFLOOR]](s64)
+    ; GFX9-LABEL: name: test_ffloor_s64_nssaz
+    ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GFX9: [[FFLOOR:%[0-9]+]]:_(s64) = nsz G_FFLOOR [[COPY]]
+    ; GFX9: $vgpr0_vgpr1 = COPY [[FFLOOR]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = nsz G_FFLOOR %0
+    $vgpr0_vgpr1 = COPY %1
+
+...
+
 ---
 name: test_ffloor_s16
 body: |
@@ -158,9 +223,20 @@
    ; SI-LABEL: name: test_ffloor_v2s64
    ; SI: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
    ; SI: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
-    ; SI: [[FFLOOR:%[0-9]+]]:_(s64) = G_FFLOOR [[UV]]
-    ; SI: [[FFLOOR1:%[0-9]+]]:_(s64) = G_FFLOOR [[UV1]]
-    ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FFLOOR]](s64), [[FFLOOR1]](s64)
+    ; SI: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[UV]](s64)
+    ; SI: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF
+    ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT]], [[C]]
+    ; SI: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(uno), [[UV]](s64), [[UV]]
+    ; SI: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[UV]], [[FMINNUM_IEEE]]
+    ; SI: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[SELECT]]
+    ; SI: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[UV]], [[FNEG]]
+    ; SI: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[UV1]](s64)
+    ; SI: [[FMINNUM_IEEE1:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT1]], [[C]]
+    ; SI: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(uno), [[UV1]](s64), [[UV1]]
+    ; SI: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[FCMP1]](s1), [[UV1]], [[FMINNUM_IEEE1]]
+    ; SI: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[SELECT1]]
+    ; SI: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[UV1]], [[FNEG1]]
+    ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FADD]](s64), [[FADD1]](s64)
    ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
    ; VI-LABEL: name: test_ffloor_v2s64
    ; VI: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
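
For readers unfamiliar with the fract-based expansion, the following is a small standalone C++ sketch (not part of the patch) of the arithmetic the new lowering emits, directly following the formula quoted in the comments above. The names bitsToDouble, floorViaFract, HwFract are illustrative only, and a library floor/fract stands in for the hardware V_FRACT_F64.

// floor_via_fract.cpp -- host-side model of the SI workaround only.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

static double bitsToDouble(uint64_t Bits) {
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

static double floorViaFract(double X) {
  // Largest double strictly less than 1.0; same constant the lowering builds.
  const double MaxFract = bitsToDouble(0x3fefffffffffffffULL);
  // Stand-in for V_FRACT_F64; the min() clamp below mirrors the workaround
  // for the SI V_FRACT bug described in the patch.
  double HwFract = X - std::floor(X);
  double Clamped = std::fmin(HwFract, MaxFract);
  // fmin returns the non-NaN operand, so NaN inputs must be passed through
  // explicitly; this is the role of the G_FCMP/G_SELECT pair in the lowering.
  double Corrected = std::isnan(X) ? X : Clamped;
  // floor(x) = x - fract(x)
  return X - Corrected;
}

int main() {
  for (double X : {1.5, -2.25, 42.0})
    std::printf("floor(%g) = %g\n", X, floorViaFract(X));
  return 0;
}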