Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -901,6 +901,20 @@
 }
 
 // FIXME: Consider maybe adding intrinsics for sitofp, uitofp.
+
+// These intrinsics allow control of the rounding mode used by fptrunc.
+// The rounding mode is set before the conversion and restored to the
+// default rounding mode after the call.
+
+// Truncate a floating point number, rounding towards +inf
+def int_experimental_fptrunc_round_upward : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+                                                                  [ llvm_anyfloat_ty ]>;
+
+// Truncate a floating point number, rounding towards -inf
+def int_experimental_fptrunc_round_downward : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+                                                                    [ llvm_anyfloat_ty ]>;
+
+
 //===------------------------- Expect Intrinsics --------------------------===//
 //
 def int_expect : DefaultAttrsIntrinsic<[llvm_anyint_ty],
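Note: once the definitions above are in place, a frontend can emit these intrinsics through IRBuilder. A minimal sketch, not part of the patch itself; emitFPTruncRound is a hypothetical helper name:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Hypothetical helper: truncate Src to DstTy, rounding towards +inf or
// -inf instead of using the current default rounding mode.
static Value *emitFPTruncRound(IRBuilder<> &B, Value *Src, Type *DstTy,
                               bool Upward) {
  Intrinsic::ID IID = Upward ? Intrinsic::experimental_fptrunc_round_upward
                             : Intrinsic::experimental_fptrunc_round_downward;
  // Both the result and the source types are overloaded (llvm_anyfloat_ty
  // above), so both appear in the overload type list.
  return B.CreateIntrinsic(IID, {DstTy, Src->getType()}, {Src});
}

The corresponding textual IR form of the calls can be seen in the test file at the end of this patch.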
Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
 
@@ -695,6 +696,22 @@
     break;
   }
 
+  case AMDGPUISD::FPTRUNC_ROUND_UPWARD: {
+    SDLoc DL(N);
+    SDValue ImmZero = CurDAG->getTargetConstant(0, DL, MVT::i1);
+    ReplaceNode(N, CurDAG->getMachineNode(
+                       AMDGPU::FPTRUNC_UPWARD_PSEUDO, DL, N->getVTList(),
+                       {ImmZero, N->getOperand(1), ImmZero, ImmZero}));
+    return;
+  }
+  case AMDGPUISD::FPTRUNC_ROUND_DOWNWARD: {
+    SDLoc DL(N);
+    SDValue ImmZero = CurDAG->getTargetConstant(0, DL, MVT::i1);
+    ReplaceNode(N, CurDAG->getMachineNode(
+                       AMDGPU::FPTRUNC_DOWNWARD_PSEUDO, DL, N->getVTList(),
+                       {ImmZero, N->getOperand(1), ImmZero, ImmZero}));
+    return;
+  }
   case ISD::INTRINSIC_W_CHAIN: {
     SelectINTRINSIC_W_CHAIN(N);
     return;
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -483,6 +483,9 @@
   CONST_DATA_PTR,
   PC_ADD_REL_OFFSET,
   LDS,
+  FPTRUNC_ROUND_UPWARD,
+  FPTRUNC_ROUND_DOWNWARD,
+
   DUMMY_CHAIN,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
   LOAD_D16_HI,
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4445,6 +4445,8 @@
   NODE_NAME_CASE(CONST_DATA_PTR)
   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
   NODE_NAME_CASE(LDS)
+  NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
+  NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
   NODE_NAME_CASE(DUMMY_CHAIN)
   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
   NODE_NAME_CASE(LOAD_D16_HI)
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3249,6 +3249,27 @@
   case AMDGPU::G_SBFX:
   case AMDGPU::G_UBFX:
     return selectG_SBFX_UBFX(I);
+  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
+  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD: {
+    Register Dst = I.getOperand(0).getReg();
+    const DebugLoc &DL = I.getDebugLoc();
+    MachineBasicBlock *MBB = I.getParent();
+
+    unsigned Opcode = (I.getOpcode() == AMDGPU::G_FPTRUNC_ROUND_UPWARD)
+                          ? AMDGPU::FPTRUNC_UPWARD_PSEUDO
+                          : AMDGPU::FPTRUNC_DOWNWARD_PSEUDO;
+
+    Register SrcReg = I.getOperand(1).getReg();
+
+    BuildMI(*MBB, &I, DL, TII.get(Opcode), Dst)
+        .addImm(0)
+        .addReg(SrcReg)
+        .addImm(0)
+        .addImm(0);
+
+    I.eraseFromParent();
+    return true;
+  }
   case AMDGPU::G_SI_CALL:
     I.setDesc(TII.get(AMDGPU::SI_CALL));
     return true;
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -170,6 +170,9 @@
   bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
 
+  bool legalizeExperimentalFPTrunc(MachineInstr &MI, MachineIRBuilder &B,
+                                   Intrinsic::ID IID) const;
+
   bool legalizeImageIntrinsic(
       MachineInstr &MI, MachineIRBuilder &B,
       GISelChangeObserver &Observer,
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4918,6 +4918,21 @@
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeExperimentalFPTrunc(MachineInstr &MI,
+                                                      MachineIRBuilder &B,
+                                                      Intrinsic::ID IID) const {
+  unsigned Opc = IID == Intrinsic::experimental_fptrunc_round_upward
+                     ? AMDGPU::G_FPTRUNC_ROUND_UPWARD
+                     : AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
+  B.buildInstr(Opc)
+      .addDef(MI.getOperand(0).getReg())
+      .addUse(MI.getOperand(2).getReg())
+      .cloneMemRefs(MI);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                             MachineInstr &MI) const {
   MachineIRBuilder &B = Helper.MIRBuilder;
@@ -5132,6 +5147,9 @@
     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
   case Intrinsic::amdgcn_image_bvh_intersect_ray:
     return legalizeBVHIntrinsic(MI, B);
+  case Intrinsic::experimental_fptrunc_round_upward:
+  case Intrinsic::experimental_fptrunc_round_downward:
+    return legalizeExperimentalFPTrunc(MI, B, IntrID);
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4594,6 +4594,12 @@
     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
     break;
   }
+  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
+  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD: {
+    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+    break;
+  }
   }
 
   return getInstructionMapping(/*ID*/1, /*Cost*/1,
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -131,6 +131,8 @@
                              SelectionDAG &DAG, EVT VT, EVT MemVT,
                              const SDLoc &SL, SDValue Val, bool Signed,
                              const ISD::InputArg *Arg = nullptr) const;
+  SDValue LowerExperimentalFPRound(SDValue Op, unsigned IntrID,
+                                   SelectionDAG &DAG) const;
 
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7554,6 +7554,15 @@
     }
     return Op;
   }
+  case Intrinsic::experimental_fptrunc_round_upward:
+  case Intrinsic::experimental_fptrunc_round_downward: {
+    unsigned Opc = IntrID == Intrinsic::experimental_fptrunc_round_upward
+                       ? AMDGPUISD::FPTRUNC_ROUND_UPWARD
+                       : AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
+
+    return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0),
+                       Op->getOperand(2));
+  }
   case Intrinsic::amdgcn_global_atomic_fadd:
     if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
       DiagnosticInfoUnsupported
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -176,6 +176,19 @@
   let mayStore = 0;
 }
 
+// Pseudo instructions for @llvm.experimental.fptrunc.round.upward
+// and @llvm.experimental.fptrunc.round.downward. The intrinsics are
+// legalized to G_FPTRUNC_ROUND_UPWARD / G_FPTRUNC_ROUND_DOWNWARD and
+// selected to these pseudos; the SIModeRegister pass then rewrites the
+// pseudos to V_CVT_F16_F32_e64 with the required rounding mode set.
+let Uses = [MODE, EXEC] in {
+def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins i1imm:$imm1, VGPR_32:$src0, i1imm:$imm2, i1imm:$imm3)> {}
+
+def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins i1imm:$imm1, VGPR_32:$src0, i1imm:$imm2, i1imm:$imm3)> {}
+} // End Uses = [MODE, EXEC]
+
 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 // restoring it after we're done.
 let Defs = [SCC] in {
@@ -3076,3 +3089,15 @@
   // TODO: Should really base this on the call target
   let isConvergent = 1;
 }
+
+def G_FPTRUNC_ROUND_UPWARD : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$vdst);
+  let InOperandList = (ins unknown:$src0);
+  let hasSideEffects = 0;
+}
+
+def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$vdst);
+  let InOperandList = (ins unknown:$src0);
+  let hasSideEffects = 0;
+}
Index: llvm/lib/Target/AMDGPU/SIModeRegister.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -17,6 +17,7 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include <queue>
 
 #define DEBUG_TYPE "si-mode-register"
@@ -162,7 +163,9 @@
 // double precision setting.
 Status SIModeRegister::getInstructionMode(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
-  if (TII->usesFPDPRounding(MI)) {
+  if (TII->usesFPDPRounding(MI) ||
+      MI.getOpcode() == AMDGPU::FPTRUNC_UPWARD_PSEUDO ||
+      MI.getOpcode() == AMDGPU::FPTRUNC_DOWNWARD_PSEUDO) {
     switch (MI.getOpcode()) {
     case AMDGPU::V_INTERP_P1LL_F16:
     case AMDGPU::V_INTERP_P1LV_F16:
@@ -170,6 +173,18 @@
       // f16 interpolation instructions need double precision round to zero
       return Status(FP_ROUND_MODE_DP(3),
                     FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+    case AMDGPU::FPTRUNC_UPWARD_PSEUDO: {
+      // Replace the pseudo with the real instruction; round towards +inf
+      MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e64));
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_INF));
+    }
+    case AMDGPU::FPTRUNC_DOWNWARD_PSEUDO: {
+      // Replace the pseudo with the real instruction; round towards -inf
+      MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e64));
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEGINF));
+    }
     default:
       return DefaultStatus;
     }
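Note that setDesc alone suffices here because the pseudos' dummy i1 immediates appear to mirror the src0_modifiers, clamp and omod operands of V_CVT_F16_F32_e64. As a reading aid for the s_setreg values checked in the test below, a minimal sketch of the MODE register's rounding field, assuming the FP_ROUND_* macros in SIDefines.h:

#include <cstdint>

// Rounding-mode values for the MODE register's FP_ROUND fields
// (assumed to match FP_ROUND_ROUND_TO_* in SIDefines.h).
enum FPRound : uint32_t {
  RoundToNearest = 0,
  RoundToInf = 1,    // round upward
  RoundToNegInf = 2, // round downward
  RoundToZero = 3
};

// FP_ROUND_MODE_DP(x): the f16/f64 rounding mode occupies MODE bits 2-3.
constexpr uint32_t fpRoundModeDP(uint32_t X) { return X << 2; }

// Upward truncation only needs bit 2 set, hence the
//   s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
// in the tests; downward only needs bit 3: hwreg(HW_REG_MODE, 3, 1), 1.
static_assert(fpRoundModeDP(RoundToInf) == (1u << 2), "upward sets bit 2");
static_assert(fpRoundModeDP(RoundToNegInf) == (1u << 3), "downward sets bit 3");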
Index: llvm/test/CodeGen/AMDGPU/llvm.experimental.fptrunc.round.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.experimental.fptrunc.round.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_gs void @test_experimental_fptrunc_round_upward(float %a, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_upward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    global_store_short v[6:7], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res = call half @llvm.experimental.fptrunc.round.upward(float %a)
+  store half %res, half addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_experimental_fptrunc_round_downward(float %a, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_downward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    global_store_short v[6:7], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res = call half @llvm.experimental.fptrunc.round.downward(float %a)
+  store half %res, half addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_experimental_fptrunc_round_upward_multiple_calls(float %a, float %b, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_upward_multiple_calls:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v2, v1
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v1, v1
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; CHECK-NEXT:    v_add_f16_e32 v0, v0, v2
+; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
+; CHECK-NEXT:    global_store_short v[7:8], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res1 = call half @llvm.experimental.fptrunc.round.upward(float %a)
+  %res2 = call half @llvm.experimental.fptrunc.round.upward(float %b)
+  %res3 = call half @llvm.experimental.fptrunc.round.downward(float %b)
+  %res4 = fadd half %res1, %res2
+  %res5 = fadd half %res3, %res4
+  store half %res5, half addrspace(1)* %out, align 4
+  ret void
+}
+
+declare half @llvm.experimental.fptrunc.round.upward(float)
+declare half @llvm.experimental.fptrunc.round.downward(float)