Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -901,6 +901,20 @@
 }
 
 // FIXME: Consider maybe adding intrinsics for sitofp, uitofp.
+
+// These intrinsics control the rounding mode used by fptrunc.
+// The rounding mode is changed before the conversion and
+// restored to the default afterwards.
+
+// Truncate a floating-point value, rounding towards +inf.
+def int_experimental_fptrunc_round_upward : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+    [ llvm_anyfloat_ty ]>;
+
+// Truncate a floating-point value, rounding towards -inf.
+def int_experimental_fptrunc_round_downward : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+    [ llvm_anyfloat_ty ]>;
+
+
 //===------------------------- Expect Intrinsics --------------------------===//
 //
 def int_expect : DefaultAttrsIntrinsic<[llvm_anyint_ty],
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4497,6 +4497,16 @@
       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
       break;
     }
+    case Intrinsic::experimental_fptrunc_round_upward:
+    case Intrinsic::experimental_fptrunc_round_downward: {
+      unsigned Bank =
+          getRegBankID(MI.getOperand(2).getReg(), MRI, AMDGPU::SGPRRegBankID);
+
+      unsigned SizeDst = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeDst);
+      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
+      break;
+    }
     default:
       return getInvalidInstructionMapping();
     }
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -131,6 +131,8 @@
                               SelectionDAG &DAG, EVT VT, EVT MemVT,
                               const SDLoc &SL, SDValue Val, bool Signed,
                               const ISD::InputArg *Arg = nullptr) const;
+  SDValue LowerExperimentalFPRound(SDValue Op, unsigned IntrID,
+                                   SelectionDAG &DAG) const;
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -176,6 +176,18 @@
   let mayStore = 0;
 }
 
+// Pseudo instructions for @llvm.experimental.fptrunc.round.upward
+// and @llvm.experimental.fptrunc.round.downward.
+let Uses = [MODE, EXEC] in {
+def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins VGPR_32:$src0),
+  [(set f16:$vdst, (int_experimental_fptrunc_round_upward f32:$src0))]>;
+
+def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins VGPR_32:$src0),
+  [(set f16:$vdst, (int_experimental_fptrunc_round_downward f32:$src0))]>;
+} // End Uses = [MODE, EXEC]
+
 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 // restoring it after we're done.
 let Defs = [SCC] in {
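Note (illustration, not part of the patch): semantically, each intrinsic is an
fptrunc performed under a temporarily overridden rounding mode, with the mode
restored afterwards. Standard C++ has no half type, so the host-side sketch
below uses a double -> float narrowing with <cfenv> to show the same effect;
the constant 0.1 and the program itself are illustrative only.

    // Narrow a double to float under explicit rounding modes, mirroring what
    // @llvm.experimental.fptrunc.round.{upward,downward} do for float -> half.
    #include <cfenv>
    #include <cstdio>

    #pragma STDC FENV_ACCESS ON // honored by some compilers; volatile below
                                // keeps the conversions at run time anyway

    int main() {
      volatile double X = 0.1; // not exactly representable in float

      std::fesetround(FE_UPWARD);
      float Up = static_cast<float>(X);   // rounds towards +inf

      std::fesetround(FE_DOWNWARD);
      float Down = static_cast<float>(X); // rounds towards -inf

      // Restore the default, as the intrinsics do after the conversion.
      std::fesetround(FE_TONEAREST);

      std::printf("up=%.9g down=%.9g\n", Up, Down); // two different results
      return 0;
    }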
Index: llvm/lib/Target/AMDGPU/SIModeRegister.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -134,6 +134,8 @@
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
+  void selectSRoundModePseudos(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
   void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII);
 
   void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII);
@@ -401,6 +403,74 @@
   }
 }
 
+// In this phase we select the pseudo-instructions that need a special
+// rounding mode. These pseudo-instructions were introduced by
+// @llvm.experimental.fptrunc.round.upward and
+// @llvm.experimental.fptrunc.round.downward.
+// Doing the selection in this pass lets us optimize a sequence of these
+// intrinsics by not inserting redundant s_round_mode instructions.
+// FPTRUNC_UPWARD_PSEUDO is selected as
+//   s_round_mode 0x4
+//   v_cvt_f16_f32_e64 v0, v0
+//   s_round_mode 0x0
+// and FPTRUNC_DOWNWARD_PSEUDO as
+//   s_round_mode 0x8
+//   v_cvt_f16_f32_e64 v0, v0
+//   s_round_mode 0x0
+void SIModeRegister::selectSRoundModePseudos(MachineBasicBlock &MBB,
+                                             const SIInstrInfo *TII) {
+
+  // Collect the FPTRUNC_UPWARD_PSEUDO and FPTRUNC_DOWNWARD_PSEUDO
+  // instructions; they are pseudo-instructions that we select here.
+  std::vector<MachineInstr *> FPTruncPseudoInstructions;
+  for (MachineInstr &MI : MBB) {
+    MachineInstr *PrevInst = MI.getPrevNode();
+
+    // If there is no previous instruction, or if it is not another
+    // intrinsic of the same kind, insert an s_round_mode.
+    if (MI.getOpcode() == AMDGPU::FPTRUNC_UPWARD_PSEUDO) {
+      if (PrevInst == nullptr ||
+          PrevInst->getOpcode() != AMDGPU::FPTRUNC_UPWARD_PSEUDO) {
+        unsigned Mode = FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_INF);
+        BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::S_ROUND_MODE))
+            .addImm(Mode);
+      }
+      FPTruncPseudoInstructions.push_back(&MI);
+    } else if (MI.getOpcode() == AMDGPU::FPTRUNC_DOWNWARD_PSEUDO) {
+      if (PrevInst == nullptr ||
+          PrevInst->getOpcode() != AMDGPU::FPTRUNC_DOWNWARD_PSEUDO) {
+        unsigned Mode = FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEGINF);
+        BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::S_ROUND_MODE))
+            .addImm(Mode);
+      }
+      FPTruncPseudoInstructions.push_back(&MI);
+    } else if (PrevInst != nullptr &&
+               (PrevInst->getOpcode() == AMDGPU::FPTRUNC_UPWARD_PSEUDO ||
+                PrevInst->getOpcode() == AMDGPU::FPTRUNC_DOWNWARD_PSEUDO)) {
+      // This is the end of a sequence of fptrunc; restore the default mode.
+      unsigned Mode = FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST);
+      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::S_ROUND_MODE))
+          .addImm(Mode);
+    }
+  }
+
+  // Finally, replace the pseudo-instructions with real ones.
+  for (MachineInstr *MI : FPTruncPseudoInstructions) {
+    const DebugLoc &DL = MI->getDebugLoc();
+    MachineOperand &Dest = MI->getOperand(0);
+    MachineOperand &Src0 = MI->getOperand(1);
+
+    // Build the actual conversion instruction.
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CVT_F16_F32_e64), Dest.getReg())
+        .addImm(0)
+        .add(Src0)
+        .addImm(0)
+        .addImm(0);
+
+    MI->eraseFromParent();
+  }
+}
+
 bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
   BlockInfo.resize(MF.getNumBlockIDs());
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -428,6 +498,10 @@
   for (MachineBasicBlock &BB : MF)
     processBlockPhase3(BB, TII);
 
+  // Select the pseudo-instructions that need a special rounding mode.
+  for (MachineBasicBlock &BB : MF)
+    selectSRoundModePseudos(BB, TII);
+
   BlockInfo.clear();
 
   return Changed;
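Note (illustration, not part of the patch): the s_round_mode immediates above
follow from the MODE register layout, where bits [1:0] hold the f32 round mode
and bits [3:2] the f64/f16 round mode. The sketch below mirrors the FP_ROUND_*
encodings from SIDefines.h; treat those values as an assumption of the sketch.
It shows how the 0x4, 0x8 and 0x0 immediates are obtained.

    #include <cstdio>

    // Mirrored from the AMDGPU backend for illustration; these values are an
    // assumption of this sketch, not a definition.
    enum FPRound {
      FP_ROUND_ROUND_TO_NEAREST = 0,
      FP_ROUND_ROUND_TO_INF = 1,
      FP_ROUND_ROUND_TO_NEGINF = 2,
      FP_ROUND_ROUND_TO_ZERO = 3,
    };

    // The f64/f16 round bits occupy MODE[3:2].
    constexpr unsigned fpRoundModeDP(unsigned Mode) { return (Mode & 3) << 2; }

    int main() {
      std::printf("upward:   0x%x\n", fpRoundModeDP(FP_ROUND_ROUND_TO_INF));     // 0x4
      std::printf("downward: 0x%x\n", fpRoundModeDP(FP_ROUND_ROUND_TO_NEGINF));  // 0x8
      std::printf("default:  0x%x\n", fpRoundModeDP(FP_ROUND_ROUND_TO_NEAREST)); // 0x0
      return 0;
    }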
Index: llvm/test/CodeGen/AMDGPU/llvm.experimental.fptrunc.round.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.experimental.fptrunc.round.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_gs void @test_experimental_fptrunc_round_upward(float %a, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_upward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_round_mode 0x4
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    s_round_mode 0x0
+; CHECK-NEXT:    global_store_short v[6:7], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res = call half @llvm.experimental.fptrunc.round.upward(float %a)
+  store half %res, half addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_experimental_fptrunc_round_downward(float %a, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_downward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_round_mode 0x8
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    s_round_mode 0x0
+; CHECK-NEXT:    global_store_short v[6:7], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res = call half @llvm.experimental.fptrunc.round.downward(float %a)
+  store half %res, half addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_experimental_fptrunc_round_upward_multiple_calls(float %a, float %b, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_upward_multiple_calls:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_round_mode 0x4
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v2, v1
+; CHECK-NEXT:    s_round_mode 0x8
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v1, v1
+; CHECK-NEXT:    s_round_mode 0x0
+; CHECK-NEXT:    v_add_f16_e32 v0, v0, v2
+; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
+; CHECK-NEXT:    global_store_short v[7:8], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res1 = call half @llvm.experimental.fptrunc.round.upward(float %a)
+  %res2 = call half @llvm.experimental.fptrunc.round.upward(float %b)
+  %res3 = call half @llvm.experimental.fptrunc.round.downward(float %b)
+  %res4 = fadd half %res1, %res2
+  %res5 = fadd half %res3, %res4
+  store half %res5, half addrspace(1)* %out, align 4
+  ret void
+}
+
+declare half @llvm.experimental.fptrunc.round.upward(float)
+declare half @llvm.experimental.fptrunc.round.downward(float)
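Note (illustration, not part of the patch): a frontend that wants to use the
new intrinsics would emit them through IRBuilder. The helper below is a
hypothetical sketch, assuming this patch is applied so that
Intrinsic::experimental_fptrunc_round_upward exists; since both the result and
argument types are overloaded (llvm_anyfloat_ty), both are passed as overload
types and IRBuilder derives the mangled callee name.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"

    using namespace llvm;

    // Emits a call equivalent to the ones in the tests above:
    //   %res = call half @llvm.experimental.fptrunc.round.upward(float %a)
    static Value *emitFPTruncRoundUpward(IRBuilder<> &B, Value *FloatVal) {
      Type *HalfTy = B.getHalfTy();
      return B.CreateIntrinsic(Intrinsic::experimental_fptrunc_round_upward,
                               {HalfTy, FloatVal->getType()}, {FloatVal});
    }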