Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -901,6 +901,20 @@
 }
 
 // FIXME: Consider maybe adding intrinsics for sitofp, uitofp.
+
+// These intrinsics allow control of the rounding mode used by fptrunc.
+// The rounding mode is set before the conversion takes place and is
+// restored to the default after the call.
+
+// Truncate a floating-point number, rounding toward +inf
+def int_experimental_fptrunc_round_upward : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+                                                                  [ llvm_anyfloat_ty ]>;
+
+// Truncate a floating-point number, rounding toward -inf
+def int_experimental_fptrunc_round_downward : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+                                                                    [ llvm_anyfloat_ty ]>;
+
+
 //===------------------------- Expect Intrinsics --------------------------===//
 //
 def int_expect : DefaultAttrsIntrinsic<[llvm_anyint_ty],
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4497,6 +4497,16 @@
       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
       break;
     }
+    case Intrinsic::experimental_fptrunc_round_upward:
+    case Intrinsic::experimental_fptrunc_round_downward: {
+      unsigned Bank =
+          getRegBankID(MI.getOperand(2).getReg(), MRI, AMDGPU::SGPRRegBankID);
+
+      unsigned SizeDst = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeDst);
+      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
+      break;
+    }
     default:
       return getInvalidInstructionMapping();
     }
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -131,6 +131,8 @@
                              SelectionDAG &DAG, EVT VT, EVT MemVT,
                              const SDLoc &SL, SDValue Val, bool Signed,
                              const ISD::InputArg *Arg = nullptr) const;
+  SDValue LowerExperimentalFPRound(SDValue Op, unsigned IntrID,
+                                   SelectionDAG &DAG) const;
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -176,6 +176,18 @@
   let mayStore = 0;
 }
 
+// Pseudo instructions for @llvm.experimental.fptrunc.round.upward
+// and @llvm.experimental.fptrunc.round.downward
+let Uses = [MODE, EXEC] in {
+def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins VGPR_32:$src0),
+  [(set f16:$vdst, (int_experimental_fptrunc_round_upward f32:$src0))]>;
+
+def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins VGPR_32:$src0),
+  [(set f16:$vdst, (int_experimental_fptrunc_round_downward f32:$src0))]>;
+} // End Uses = [MODE, EXEC]
+
 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 // restoring it after we're done.
 let Defs = [SCC] in {
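For reference, a minimal IR-level sketch of how the intrinsics defined above can be used. The function name and the interval use case are illustrative only, not part of this patch; the unmangled declarations follow the convention of the test added at the end of the patch:

; Truncate a float to half twice, rounding downward and upward, to get a
; conservative [lo, hi] half-precision interval enclosing the exact value.
define { half, half } @interval_fptrunc(float %x) {
  %lo = call half @llvm.experimental.fptrunc.round.downward(float %x)
  %hi = call half @llvm.experimental.fptrunc.round.upward(float %x)
  %r0 = insertvalue { half, half } undef, half %lo, 0
  %r1 = insertvalue { half, half } %r0, half %hi, 1
  ret { half, half } %r1
}

declare half @llvm.experimental.fptrunc.round.downward(float)
declare half @llvm.experimental.fptrunc.round.upward(float)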
Index: llvm/lib/Target/AMDGPU/SIModeRegister.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -134,6 +134,8 @@
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
+  void selectModePseudos(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
   void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII);
 
   void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII);
@@ -162,7 +164,9 @@
 // double precision setting.
 Status SIModeRegister::getInstructionMode(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
-  if (TII->usesFPDPRounding(MI)) {
+  if (TII->usesFPDPRounding(MI) ||
+      MI.getOpcode() == AMDGPU::FPTRUNC_UPWARD_PSEUDO ||
+      MI.getOpcode() == AMDGPU::FPTRUNC_DOWNWARD_PSEUDO) {
     switch (MI.getOpcode()) {
     case AMDGPU::V_INTERP_P1LL_F16:
     case AMDGPU::V_INTERP_P1LV_F16:
@@ -170,6 +174,12 @@
       // f16 interpolation instructions need double precision round to zero
       return Status(FP_ROUND_MODE_DP(3),
                     FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+    case AMDGPU::FPTRUNC_UPWARD_PSEUDO:
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_INF));
+    case AMDGPU::FPTRUNC_DOWNWARD_PSEUDO:
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEGINF));
     default:
       return DefaultStatus;
     }
@@ -401,6 +411,43 @@
   }
 }
 
+// In this phase we select the pseudo-instructions that need a special mode.
+// These pseudo-instructions were introduced by uses of
+// @llvm.experimental.fptrunc.round.upward and
+// @llvm.experimental.fptrunc.round.downward.
+// Doing the selection in this pass lets a sequence of these intrinsics be
+// optimized, since we avoid rewriting the mode register more than necessary.
+// FPTRUNC_UPWARD_PSEUDO needs the FP_ROUND_ROUND_TO_INF rounding mode,
+// and FPTRUNC_DOWNWARD_PSEUDO needs FP_ROUND_ROUND_TO_NEGINF.
+void SIModeRegister::selectModePseudos(MachineBasicBlock &MBB,
+                                       const SIInstrInfo *TII) {
+  // Find FPTRUNC_UPWARD_PSEUDO and FPTRUNC_DOWNWARD_PSEUDO instructions.
+  // These are pseudo-instructions, so we need to select them now.
+  std::vector<MachineInstr *> ToDelete;
+
+  for (MachineInstr &MI : MBB) {
+    if (MI.getOpcode() == AMDGPU::FPTRUNC_UPWARD_PSEUDO ||
+        MI.getOpcode() == AMDGPU::FPTRUNC_DOWNWARD_PSEUDO) {
+      ToDelete.push_back(&MI);
+      // The appropriate changes to the mode register have already been made
+      // at this point.
+      const DebugLoc &DL = MI.getDebugLoc();
+      MachineOperand &Dest = MI.getOperand(0);
+      MachineOperand &Src0 = MI.getOperand(1);
+
+      // Build the MI that performs the actual conversion.
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CVT_F16_F32_e64), Dest.getReg())
+          .addImm(0)
+          .add(Src0)
+          .addImm(0)
+          .addImm(0);
+    }
+  }
+
+  for (MachineInstr *MI : ToDelete)
+    MI->eraseFromParent();
+}
+
 bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
   BlockInfo.resize(MF.getNumBlockIDs());
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -428,6 +475,10 @@
   for (MachineBasicBlock &BB : MF)
     processBlockPhase3(BB, TII);
 
+  // Select the pseudo-instructions that need a special mode.
+  for (MachineBasicBlock &BB : MF)
+    selectModePseudos(BB, TII);
+
   BlockInfo.clear();
 
   return Changed;
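To illustrate the optimization claim in the selectModePseudos comment above: because the pseudos feed their rounding requirement into the pass's existing Status propagation, back-to-back conversions in the same direction should share a single mode change. A hypothetical fragment (not one of the patch's tests, which exercise the mixed-direction case below):

; Both calls round upward, so SIModeRegister should emit one s_setreg
; before the first v_cvt_f16_f32 and none between the two conversions.
define amdgpu_gs void @two_upward(float %a, float %b, half addrspace(1)* %out) {
  %x = call half @llvm.experimental.fptrunc.round.upward(float %a)
  %y = call half @llvm.experimental.fptrunc.round.upward(float %b)
  %s = fadd half %x, %y
  store half %s, half addrspace(1)* %out, align 2
  ret void
}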
Index: llvm/test/CodeGen/AMDGPU/llvm.experimental.fptrunc.round.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.experimental.fptrunc.round.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_gs void @test_experimental_fptrunc_round_upward(float %a, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_upward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    global_store_short v[6:7], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res = call half @llvm.experimental.fptrunc.round.upward(float %a)
+  store half %res, half addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_experimental_fptrunc_round_downward(float %a, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_downward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    global_store_short v[6:7], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res = call half @llvm.experimental.fptrunc.round.downward(float %a)
+  store half %res, half addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_experimental_fptrunc_round_upward_multiple_calls(float %a, float %b, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_upward_multiple_calls:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v2, v1
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v1, v1
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; CHECK-NEXT:    v_add_f16_e32 v0, v0, v2
+; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
+; CHECK-NEXT:    global_store_short v[7:8], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res1 = call half @llvm.experimental.fptrunc.round.upward(float %a)
+  %res2 = call half @llvm.experimental.fptrunc.round.upward(float %b)
+  %res3 = call half @llvm.experimental.fptrunc.round.downward(float %b)
+  %res4 = fadd half %res1, %res2
+  %res5 = fadd half %res3, %res4
+  store half %res5, half addrspace(1)* %out, align 4
+  ret void
+}
+
+declare half @llvm.experimental.fptrunc.round.upward(float)
+declare half @llvm.experimental.fptrunc.round.downward(float)
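Note that both intrinsics are declared with llvm_anyfloat_ty for result and operand, so they are overloaded; this patch only adds AMDGPU selection patterns for the f32 to f16 form exercised above. A call on other types, such as the hypothetical double to float example below (written with explicit overload suffixes), is well-formed IR under these definitions but would need additional lowering support:

; Illustrative only: overloaded on a double source and float result.
declare float @llvm.experimental.fptrunc.round.downward.f32.f64(double)

define float @trunc_double_downward(double %x) {
  %r = call float @llvm.experimental.fptrunc.round.downward.f32.f64(double %x)
  ret float %r
}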