Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -23839,3 +23839,36 @@
 The '``llvm.preserve.struct.access.index``' intrinsic produces the same result
 as a getelementptr with base ``base`` and access operands ``{0, gep_index}``.
+
+'``llvm.fptrunc.round``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <ty2>
+      @llvm.fptrunc.round(<type> <value>, metadata <rounding mode>)
+
+Overview:
+"""""""""
+
+The '``llvm.fptrunc.round``' intrinsic truncates
+:ref:`floating-point <t_floating>` ``value`` to type ``ty2`` with a specified
+rounding mode.
+
+Arguments:
+""""""""""
+
+The first argument to the '``llvm.fptrunc.round``' intrinsic must be
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
+floating-point values. This argument must be larger in size than the result.
+
+The second argument specifies the rounding mode as described in the
+constrained intrinsics section.
+
+Semantics:
+""""""""""
+
+The result produced is a floating-point value truncated to be smaller in size
+than the operand, rounded according to the given rounding mode.
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -901,6 +901,12 @@
 }

 // FIXME: Consider maybe adding intrinsics for sitofp, uitofp.
+
+// Truncate a floating-point number with a specific rounding mode.
+def int_fptrunc_round : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+                                              [ llvm_anyfloat_ty, llvm_metadata_ty ],
+                                              [ IntrNoMem ]>;
+
 //===------------------------- Expect Intrinsics --------------------------===//
 //
 def int_expect : DefaultAttrsIntrinsic<[llvm_anyint_ty],
Index: llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2251,6 +2251,34 @@
     Info.OrigRet = {Register(), Type::getVoidTy(CI.getContext()), 0};
     return CLI->lowerCall(MIRBuilder, Info);
   }
+  case Intrinsic::fptrunc_round: {
+    unsigned Flags = MachineInstr::copyFlagsFromInstruction(CI);
+
+    // Convert the metadata argument to a constant integer.
+    Metadata *MD = cast<MetadataAsValue>(CI.getArgOperand(1))->getMetadata();
+    Optional<RoundingMode> RoundMode =
+        convertStrToRoundingMode(cast<MDString>(MD)->getString());
+
+    ArrayRef<Register> ResultRegs;
+    if (!CI.getType()->isVoidTy())
+      ResultRegs = getOrCreateVRegs(CI);
+
+    MachineInstrBuilder MIB =
+        MIRBuilder.buildIntrinsic(ID, ResultRegs, !CI.doesNotAccessMemory());
+
+    // Add the source operand, then the rounding mode as an immediate.
+    MIB.addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+    MIB.addImm((int)RoundMode.getValue());
+    MIB->setFlags(Flags);
+
+    return true;
+  }
 #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)                         \
   case Intrinsic::INTRINSIC:
 #include "llvm/IR/ConstrainedOps.def"
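Note: the IRTranslator case above and the SelectionDAGBuilder and Verifier
changes below all unwrap the metadata operand the same way. As a minimal
standalone sketch of that shared pattern (extractRoundingMode is a
hypothetical helper, not part of this patch; convertStrToRoundingMode lives
in llvm/IR/FPEnv.h)::

  #include "llvm/ADT/Optional.h"
  #include "llvm/IR/FPEnv.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Metadata.h"

  using namespace llvm;

  // Operand 1 of @llvm.fptrunc.round is a MetadataAsValue wrapping an
  // MDString such as !"round.upward"; unwrap it and map the string to a
  // RoundingMode, returning None for anything malformed.
  static Optional<RoundingMode> extractRoundingMode(const CallInst &CI) {
    auto *MAV = dyn_cast<MetadataAsValue>(CI.getArgOperand(1));
    if (!MAV)
      return None;
    auto *Str = dyn_cast<MDString>(MAV->getMetadata());
    if (!Str)
      return None;
    return convertStrToRoundingMode(Str->getString());
  }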
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6332,6 +6332,68 @@
 #include "llvm/IR/VPIntrinsics.def"
     visitVectorPredicationIntrinsic(cast<VPIntrinsic>(I));
     return;
+  case Intrinsic::fptrunc_round: {
+    const Function *F = I.getCalledFunction();
+
+    // Build the operand list.
+    SmallVector<SDValue, 8> Ops;
+
+    // Get the last argument, the metadata, and convert it to an integer in
+    // the call.
+    Metadata *MD = cast<MetadataAsValue>(I.getArgOperand(1))->getMetadata();
+    Optional<RoundingMode> RoundMode =
+        convertStrToRoundingMode(cast<MDString>(MD)->getString());
+
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+    // Add the intrinsic ID as an integer operand.
+    Ops.push_back(DAG.getTargetConstant(
+        Intrinsic, sdl, TLI.getPointerTy(DAG.getDataLayout())));
+
+    // Add the first operand.
+    const Value *Arg = I.getArgOperand(0);
+    if (!I.paramHasAttr(0, Attribute::ImmArg))
+      Ops.push_back(getValue(Arg));
+
+    // Convert the rounding mode to an integer and add it to the list.
+    Ops.push_back(
+        DAG.getTargetConstant((int)RoundMode.getValue(), sdl, MVT::i8));
+
+    SmallVector<EVT, 4> ValueVTs;
+    ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
+
+    SDVTList VTs = DAG.getVTList(ValueVTs);
+
+    // Propagate fast-math-flags from IR to node(s).
+    SDNodeFlags Flags;
+    if (auto *FPMO = dyn_cast<FPMathOperator>(&I))
+      Flags.copyFMF(*FPMO);
+    SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
+
+    SDValue Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, VTs, Ops);
+
+    if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
+      EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy);
+      Result = DAG.getNode(ISD::BITCAST, sdl, VT, Result);
+    } else
+      Result = lowerRangeToAssertZExt(DAG, I, Result);
+
+    MaybeAlign Alignment = I.getRetAlign();
+    if (!Alignment)
+      Alignment = F->getAttributes().getRetAlignment();
+
+    // Insert an `assertalign` node if there's an alignment.
+    if (InsertAssertAlign && Alignment) {
+      Result = DAG.getAssertAlign(sdl, Result, Alignment.valueOrOne());
+    }
+
+    setValue(&I, Result);
+
+    return;
+  }
   case Intrinsic::fmuladd: {
     EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
     if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
Index: llvm/lib/IR/Verifier.cpp
===================================================================
--- llvm/lib/IR/Verifier.cpp
+++ llvm/lib/IR/Verifier.cpp
@@ -4738,6 +4738,18 @@
            "an array");
     break;
   }
+  case Intrinsic::fptrunc_round: {
+    // Check the rounding mode.
+    Metadata *MD = nullptr;
+    auto *MAV = dyn_cast<MetadataAsValue>(Call.getOperand(1));
+    if (MAV)
+      MD = MAV->getMetadata();
+
+    Assert(MD && isa<MDString>(MD),
+           "rounding mode argument must be a metadata string", Call);
+
+    Optional<RoundingMode> RoundMode =
+        convertStrToRoundingMode(cast<MDString>(MD)->getString());
+    Assert(RoundMode.hasValue(), "invalid rounding mode argument", Call);
+    break;
+  }
 #define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC)                        \
   case Intrinsic::INTRINSIC:
 #include "llvm/IR/ConstrainedOps.def"
Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -695,6 +695,22 @@
     break;
   }
+  case AMDGPUISD::FPTRUNC_ROUND_UPWARD: {
+    SDLoc DL(N);
+    // Zero out the source modifiers, clamp and omod operands of the
+    // V_CVT_F16_F32_e64 that the pseudo eventually becomes.
+    SDValue ImmZero = CurDAG->getTargetConstant(0, DL, MVT::i1);
+    ReplaceNode(N, CurDAG->getMachineNode(
+                       AMDGPU::FPTRUNC_UPWARD_PSEUDO, DL, N->getVTList(),
+                       {ImmZero, N->getOperand(1), ImmZero, ImmZero}));
+    return;
+  }
+  case AMDGPUISD::FPTRUNC_ROUND_DOWNWARD: {
+    SDLoc DL(N);
+    SDValue ImmZero = CurDAG->getTargetConstant(0, DL, MVT::i1);
+    ReplaceNode(N, CurDAG->getMachineNode(
+                       AMDGPU::FPTRUNC_DOWNWARD_PSEUDO, DL, N->getVTList(),
+                       {ImmZero, N->getOperand(1), ImmZero, ImmZero}));
+    return;
+  }
   case ISD::INTRINSIC_W_CHAIN: {
     SelectINTRINSIC_W_CHAIN(N);
     return;
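Note: for context, this is roughly the call that all three paths above
consume, emitted from C++. A sketch assuming the int_fptrunc_round definition
from the Intrinsics.td hunk; emitFPTruncRoundUpward is a hypothetical helper::

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Metadata.h"
  #include "llvm/IR/Module.h"

  using namespace llvm;

  static Value *emitFPTruncRoundUpward(IRBuilder<> &B, Module &M, Value *F32) {
    LLVMContext &Ctx = M.getContext();
    // The intrinsic is overloaded on both the result and the source type,
    // so both are passed for name mangling: llvm.fptrunc.round.f16.f32.
    Function *Decl = Intrinsic::getDeclaration(
        &M, Intrinsic::fptrunc_round, {B.getHalfTy(), B.getFloatTy()});
    Value *Mode = MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.upward"));
    return B.CreateCall(Decl, {F32, Mode});
  }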
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -483,6 +483,9 @@
   CONST_DATA_PTR,
   PC_ADD_REL_OFFSET,
   LDS,
+  FPTRUNC_ROUND_UPWARD,
+  FPTRUNC_ROUND_DOWNWARD,
+
   DUMMY_CHAIN,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
   LOAD_D16_HI,
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4445,6 +4445,8 @@
   NODE_NAME_CASE(CONST_DATA_PTR)
   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
   NODE_NAME_CASE(LDS)
+  NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
+  NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
   NODE_NAME_CASE(DUMMY_CHAIN)
   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
   NODE_NAME_CASE(LOAD_D16_HI)
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3249,6 +3249,27 @@
   case AMDGPU::G_SBFX:
   case AMDGPU::G_UBFX:
     return selectG_SBFX_UBFX(I);
+  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
+  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD: {
+    Register Dst = I.getOperand(0).getReg();
+    const DebugLoc &DL = I.getDebugLoc();
+    MachineBasicBlock *MBB = I.getParent();
+
+    unsigned Opcode = (I.getOpcode() == AMDGPU::G_FPTRUNC_ROUND_UPWARD)
+                          ? AMDGPU::FPTRUNC_UPWARD_PSEUDO
+                          : AMDGPU::FPTRUNC_DOWNWARD_PSEUDO;
+
+    Register SrcReg = I.getOperand(1).getReg();
+
+    // Zero immediates fill the pseudo's modifier, clamp and omod slots.
+    BuildMI(*MBB, &I, DL, TII.get(Opcode), Dst)
+        .addImm(0)
+        .addReg(SrcReg)
+        .addImm(0)
+        .addImm(0);
+
+    I.eraseFromParent();
+    return true;
+  }
   case AMDGPU::G_SI_CALL:
     I.setDesc(TII.get(AMDGPU::SI_CALL));
     return true;
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -170,6 +170,9 @@
   bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;

+  bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B,
+                            Intrinsic::ID IID) const;
+
   bool legalizeImageIntrinsic(
       MachineInstr &MI, MachineIRBuilder &B,
       GISelChangeObserver &Observer,
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4918,6 +4918,29 @@
   return true;
 }

+bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
+                                               MachineIRBuilder &B,
+                                               Intrinsic::ID IID) const {
+  unsigned Opc;
+  int RoundMode = MI.getOperand(3).getImm();
+
+  if (RoundMode == (int)RoundingMode::TowardPositive)
+    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
+  else if (RoundMode == (int)RoundingMode::TowardNegative)
+    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
+  else
+    llvm_unreachable("unsupported rounding mode");
+
+  B.buildInstr(Opc)
+      .addDef(MI.getOperand(0).getReg())
+      .addUse(MI.getOperand(2).getReg())
+      .cloneMemRefs(MI);
+
+  MI.eraseFromParent();
+
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                             MachineInstr &MI) const {
   MachineIRBuilder &B = Helper.MIRBuilder;
@@ -5132,6 +5155,8 @@
     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
   case Intrinsic::amdgcn_image_bvh_intersect_ray:
     return legalizeBVHIntrinsic(MI, B);
+  case Intrinsic::fptrunc_round:
+    return legalizeFPTruncRound(MI, B, IntrID);
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
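Note: the legalizer above only accepts the two directed modes. A sketch of
the string-to-enum mapping it relies on (convertStrToRoundingMode and
RoundingMode come from llvm/IR/FPEnv.h and llvm/ADT/FloatingPointMode.h)::

  #include "llvm/IR/FPEnv.h"
  #include <cassert>

  using namespace llvm;

  int main() {
    // The metadata strings map to the directed RoundingMode values that
    // legalizeFPTruncRound switches over.
    assert(*convertStrToRoundingMode("round.upward") ==
           RoundingMode::TowardPositive);
    assert(*convertStrToRoundingMode("round.downward") ==
           RoundingMode::TowardNegative);
    // Any other mode, e.g. "round.tonearest", reaches the llvm_unreachable.
    return 0;
  }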
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4594,6 +4594,12 @@
     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
     break;
   }
+  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
+  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD: {
+    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+    break;
+  }
   }
   return getInstructionMapping(/*ID*/1, /*Cost*/1,
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -131,6 +131,8 @@
                          SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
                          SDValue Val, bool Signed,
                          const ISD::InputArg *Arg = nullptr) const;
+  SDValue LowerExperimentalFPRound(SDValue Op, unsigned IntrID,
+                                   SelectionDAG &DAG) const;
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17,6 +17,7 @@
 #include "AMDGPUTargetMachine.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -6811,6 +6812,21 @@
                            Op.getOperand(1), Op.getOperand(2));
     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
   }
+  case Intrinsic::fptrunc_round: {
+    unsigned Opc;
+
+    // Get the rounding mode from the last operand.
+    int RoundMode = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+    if (RoundMode == (int)RoundingMode::TowardPositive)
+      Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
+    else if (RoundMode == (int)RoundingMode::TowardNegative)
+      Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
+    else
+      llvm_unreachable("unsupported rounding mode");
+
+    // Keep the intrinsic-ID operand so that operand 1 is still the value,
+    // as expected by the FPTRUNC_ROUND selection in AMDGPUISelDAGToDAG.
+    return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0),
+                       Op->getOperand(1));
+  }
   case Intrinsic::amdgcn_fmad_ftz:
     return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
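Note: both the SIModeRegister changes and the s_setreg instructions checked
in the test below program the rounding field of the MODE register. As a
sketch of the encoding involved (mirroring SIDefines.h as I read it; the
values are corroborated by the hwreg offsets in the test, but treat them as
assumptions)::

  // Hardware rounding-mode encodings for the AMDGPU MODE register.
  enum FPRound {
    FP_ROUND_ROUND_TO_NEAREST = 0, // round to nearest even
    FP_ROUND_ROUND_TO_INF = 1,     // toward +inf  (round.upward)
    FP_ROUND_ROUND_TO_NEGINF = 2,  // toward -inf  (round.downward)
    FP_ROUND_ROUND_TO_ZERO = 3,    // toward zero
  };

  // MODE[1:0] holds the f32 rounding mode and MODE[3:2] the f64/f16 mode;
  // FP_ROUND_MODE_DP places a value in the latter field, so a mask of
  // FP_ROUND_MODE_DP(3) selects exactly those two bits.
  #define FP_ROUND_MODE_SP(x) ((x) & 0x3)
  #define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2)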
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -176,6 +176,19 @@
   let mayStore = 0;
 }

+// Pseudo instructions used for @llvm.fptrunc.round upward
+// and @llvm.fptrunc.round downward.
+// The intrinsic is lowered to G_FPTRUNC_ROUND_UPWARD and
+// G_FPTRUNC_ROUND_DOWNWARD (or the matching AMDGPUISD nodes), which are
+// selected to these pseudos; the SIModeRegister pass then rewrites them to
+// V_CVT_F16_F32_e64 and programs the required rounding mode. The operand
+// list mirrors V_CVT_F16_F32_e64 so the rewrite only swaps the descriptor.
+let Uses = [MODE, EXEC] in {
+def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins i1imm:$imm1, VGPR_32:$src0, i1imm:$imm2, i1imm:$imm3)>;
+
+def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins i1imm:$imm1, VGPR_32:$src0, i1imm:$imm2, i1imm:$imm3)>;
+} // End Uses = [MODE, EXEC]
+
 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 // restoring it after we're done.
 let Defs = [SCC] in {
@@ -3076,3 +3089,15 @@
   // TODO: Should really base this on the call target
   let isConvergent = 1;
 }
+
+def G_FPTRUNC_ROUND_UPWARD : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$vdst);
+  let InOperandList = (ins unknown:$src0);
+  let hasSideEffects = 0;
+}
+
+def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$vdst);
+  let InOperandList = (ins unknown:$src0);
+  let hasSideEffects = 0;
+}
Index: llvm/lib/Target/AMDGPU/SIModeRegister.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -162,7 +162,9 @@
 // double precision setting.
 Status SIModeRegister::getInstructionMode(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
-  if (TII->usesFPDPRounding(MI)) {
+  if (TII->usesFPDPRounding(MI) ||
+      MI.getOpcode() == AMDGPU::FPTRUNC_UPWARD_PSEUDO ||
+      MI.getOpcode() == AMDGPU::FPTRUNC_DOWNWARD_PSEUDO) {
     switch (MI.getOpcode()) {
     case AMDGPU::V_INTERP_P1LL_F16:
     case AMDGPU::V_INTERP_P1LV_F16:
@@ -170,6 +172,18 @@
       // f16 interpolation instructions need double precision round to zero
       return Status(FP_ROUND_MODE_DP(3),
                     FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+    case AMDGPU::FPTRUNC_UPWARD_PSEUDO: {
+      // Replace the pseudo with the real instruction; the returned Status
+      // makes the pass select round-to-+inf for it.
+      MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e64));
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_INF));
+    }
+    case AMDGPU::FPTRUNC_DOWNWARD_PSEUDO: {
+      // Replace the pseudo with the real instruction; the returned Status
+      // makes the pass select round-to--inf for it.
+      MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e64));
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEGINF));
+    }
     default:
       return DefaultStatus;
     }
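Note: a sketch of what the Status values returned above mean, with the
struct simplified from SIModeRegister.cpp (field roles inferred from
context, so treat the details as assumptions)::

  struct Status {
    unsigned Mask = 0; // which MODE bits this instruction requires
    unsigned Mode = 0; // the values required for those bits
  };

  // FPTRUNC_UPWARD_PSEUDO returns
  //   Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_INF)),
  // i.e. Mask = 0b1100 (MODE[3:2], the f64/f16 rounding field) and
  // Mode = 0b0100 (field value 1, round toward +inf). The pass inserts an
  // s_setreg only where the incoming mode disagrees on the masked bits,
  // which is why the two back-to-back upward conversions in the test below
  // share a single s_setreg.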
Index: llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_gs void @test_fptrunc_round_upward(float %a, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_fptrunc_round_upward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    global_store_short v[6:7], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
+  store half %res, half addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_fptrunc_round_downward(float %a, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_fptrunc_round_downward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    global_store_short v[6:7], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
+  store half %res, half addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_fptrunc_round_upward_multiple_calls(float %a, float %b, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_fptrunc_round_upward_multiple_calls:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v2, v1
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; CHECK-NEXT:    v_cvt_f16_f32_e64 v1, v1
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; CHECK-NEXT:    v_add_f16_e32 v0, v0, v2
+; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
+; CHECK-NEXT:    global_store_short v[7:8], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
+  %res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
+  %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
+  %res4 = fadd half %res1, %res2
+  %res5 = fadd half %res3, %res4
+  store half %res5, half addrspace(1)* %out, align 4
+  ret void
+}
+
+declare half @llvm.fptrunc.round.f16.f32(float, metadata)
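Note: to tie the test values back to the LangRef semantics, a host-side
sketch of what the two directed modes do on an f32 -> f16 truncation, using
APFloat (illustration only, not part of the patch)::

  #include "llvm/ADT/APFloat.h"
  #include <cstdio>

  using namespace llvm;

  // Truncate a float to half with an explicit rounding mode, then widen it
  // back to float so it can be printed.
  static float truncRound(float V, APFloat::roundingMode RM) {
    bool LosesInfo;
    APFloat F(V);
    F.convert(APFloat::IEEEhalf(), RM, &LosesInfo);
    F.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return F.convertToFloat();
  }

  int main() {
    // 2049.0 is not representable in f16 (halves step by 2.0 in
    // [2048, 4096)), so the two directed modes bracket it.
    printf("%g\n", truncRound(2049.0f, APFloat::rmTowardPositive)); // 2050
    printf("%g\n", truncRound(2049.0f, APFloat::rmTowardNegative)); // 2048
    return 0;
  }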