Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -901,6 +901,14 @@
 }
 
 // FIXME: Consider maybe adding intrinsics for sitofp, uitofp.
+
+def int_experimental_fptrunc_round_upward : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+                                                                  [ llvm_anyfloat_ty ]>;
+
+def int_experimental_fptrunc_round_downward : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+                                                                    [ llvm_anyfloat_ty ]>;
+
+
 //===------------------------- Expect Intrinsics --------------------------===//
 //
 def int_expect : DefaultAttrsIntrinsic<[llvm_anyint_ty],
Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2692,6 +2692,7 @@
 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  unsigned Opcode;
   switch (IntrID) {
   case Intrinsic::amdgcn_ds_append:
   case Intrinsic::amdgcn_ds_consume: {
@@ -2700,6 +2701,16 @@
     SelectDSAppendConsume(N, IntrID);
     return;
   }
+  case Intrinsic::experimental_fptrunc_round_upward: {
+    Opcode = AMDGPU::FPTRUNC_UPWARD_PSEUDO;
+    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {N->getOperand(2)});
+    return;
+  }
+  case Intrinsic::experimental_fptrunc_round_downward: {
+    Opcode = AMDGPU::FPTRUNC_DOWNWARD_PSEUDO;
+    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {N->getOperand(2)});
+    return;
+  }
   }
 
   SelectCode(N);
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -123,6 +123,8 @@
   bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
   bool selectSBarrier(MachineInstr &MI) const;
+  bool selectExperimentalFPTruncRound(MachineInstr &MI,
+                                      Intrinsic::ID IID) const;
   bool selectImageIntrinsic(MachineInstr &MI,
                             const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1489,6 +1489,25 @@
   return selectImpl(MI, *CoverageInfo);
 }
 
+bool AMDGPUInstructionSelector::selectExperimentalFPTruncRound(
+    MachineInstr &MI, Intrinsic::ID IID) const {
+  Register Dst = MI.getOperand(0).getReg();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock *MBB = MI.getParent();
+
+  unsigned Opcode = (IID == Intrinsic::experimental_fptrunc_round_upward)
+                        ? AMDGPU::FPTRUNC_UPWARD_PSEUDO
+                        : AMDGPU::FPTRUNC_DOWNWARD_PSEUDO;
+
+  Register SrcReg = MI.getOperand(2).getReg();
+
+  BuildMI(*MBB, &MI, DL, TII.get(Opcode), Dst).addReg(SrcReg);
+
+  MI.eraseFromParent();
+
+  return true;
+}
+
 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                          bool &IsTexFail) {
   if (TexFailCtrl)
@@ -1793,6 +1812,9 @@
     return selectSBarrier(I);
   case Intrinsic::amdgcn_global_atomic_fadd:
     return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
+  case Intrinsic::experimental_fptrunc_round_upward:
+  case Intrinsic::experimental_fptrunc_round_downward:
+    return selectExperimentalFPTruncRound(I, IntrinsicID);
   default: {
     return selectImpl(I, *CoverageInfo);
   }
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4497,6 +4497,16 @@
       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
       break;
     }
+    case Intrinsic::experimental_fptrunc_round_upward:
+    case Intrinsic::experimental_fptrunc_round_downward: {
+      unsigned Bank =
+          getRegBankID(MI.getOperand(2).getReg(), MRI, AMDGPU::SGPRRegBankID);
+
+      unsigned SizeDst = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeDst);
+      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
+      break;
+    }
     default:
       return getInvalidInstructionMapping();
     }
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -131,6 +131,8 @@
                              SelectionDAG &DAG, EVT VT, EVT MemVT,
                              const SDLoc &SL, SDValue Val, bool Signed,
                              const ISD::InputArg *Arg = nullptr) const;
+  SDValue LowerExperimentalFPRound(SDValue Op, unsigned IntrID,
+                                   SelectionDAG &DAG) const;
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3973,6 +3973,34 @@
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 
   switch (MI.getOpcode()) {
+  // For v_cvt_f16_f32, the output rounding mode is taken from the DP rounding
+  // mode field of the MODE register.
+  case AMDGPU::FPTRUNC_UPWARD_PSEUDO:
+  case AMDGPU::FPTRUNC_DOWNWARD_PSEUDO: {
+    unsigned Mode = (MI.getOpcode() == AMDGPU::FPTRUNC_UPWARD_PSEUDO)
+                        ? FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_INF)
+                        : FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEGINF);
+
+    const DebugLoc &DL = MI.getDebugLoc();
+    MachineOperand &Dest = MI.getOperand(0);
+    MachineOperand &Src0 = MI.getOperand(1);
+
+    // Change the MODE register before the conversion.
+    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ROUND_MODE)).addImm(Mode);
+
+    // Build an MI to do the conversion.
+    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CVT_F16_F32_e64), Dest.getReg())
+        .addImm(0)
+        .add(Src0)
+        .addImm(0)
+        .addImm(0);
+
+    // Finally, restore the default rounding mode.
+    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ROUND_MODE))
+        .addImm(FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST));
+
+    MI.eraseFromParent();
+    return BB;
+  }
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -176,6 +176,13 @@
   let mayStore = 0;
 }
 
+// Pseudo instructions for @llvm.experimental.fptrunc.round.upward
+// and @llvm.experimental.fptrunc.round.downward.
+let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [MODE, EXEC] in {
+def FPTRUNC_UPWARD_PSEUDO : SPseudoInstSI <(outs unknown:$sdst), (ins unknown:$src0)>;
+def FPTRUNC_DOWNWARD_PSEUDO : SPseudoInstSI <(outs unknown:$sdst), (ins unknown:$src0)>;
+} // End let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [MODE, EXEC]
+
 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 // restoring it after we're done.
 let Defs = [SCC] in {
Index: llvm/lib/Target/AMDGPU/SIModeRegister.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -134,6 +134,8 @@
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
+  void processSRoundMode(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
   void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII);
 
   void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII);
@@ -401,6 +403,64 @@
   }
 }
 
+// In this phase we iterate over the instructions of the block, looking for
+// opportunities to optimize s_round_mode instructions, for example in a
+// pattern like this one:
+//   s_round_mode 0x4
+//   v_cvt_f16_f32_e32 v0, v0
+//   s_round_mode 0x0
+//   s_round_mode 0x4
+//   v_cvt_f16_f32_e32 v1, v1
+//   s_round_mode 0x0
+// These s_round_mode instructions were probably introduced by intrinsic
+// calls. Try to merge them when possible.
+void SIModeRegister::processSRoundMode(MachineBasicBlock &MBB,
+                                       const SIInstrInfo *TII) {
+
+  std::vector<std::vector<MachineInstr *>> SecondPhase;
+
+  // Collect triplets of instructions that are of the form:
+  //   S_ROUND_MODE
+  //   instruction
+  //   S_ROUND_MODE
+  for (MachineInstr &MI : MBB) {
+    MachineInstr *PrevInst = MI.getPrevNode();
+    MachineInstr *NextInst = MI.getNextNode();
+    if (PrevInst != nullptr && NextInst != nullptr &&
+        PrevInst->getOpcode() == AMDGPU::S_ROUND_MODE &&
+        NextInst->getOpcode() == AMDGPU::S_ROUND_MODE) {
+      std::vector<MachineInstr *> candidate;
+      candidate.push_back(PrevInst);
+      candidate.push_back(&MI);
+      candidate.push_back(NextInst);
+      SecondPhase.push_back(candidate);
+    }
+  }
+
+  // We need at least two triplets to be able to merge them.
+  if (SecondPhase.size() <= 1)
+    return;
+
+  std::vector<MachineInstr *> PreviousCandidate;
+
+  // Iterate over the triplets and try to merge some of them.
+  for (const std::vector<MachineInstr *> &candidate : SecondPhase) {
+    if (!PreviousCandidate.empty()) {
+      // Check that the S_ROUND_MODE values are compatible.
+      if (candidate[0]->getOperand(0).getImm() ==
+              PreviousCandidate[0]->getOperand(0).getImm() &&
+          candidate[2]->getOperand(0).getImm() ==
+              PreviousCandidate[2]->getOperand(0).getImm()) {
+        // We only need one S_ROUND_MODE at the beginning/end of the sequence.
+        // Delete the other ones.
+        PreviousCandidate[2]->removeFromParent();
+        candidate[0]->removeFromParent();
+      }
+    }
+    PreviousCandidate = candidate;
+  }
+}
+
 bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
   BlockInfo.resize(MF.getNumBlockIDs());
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -428,6 +488,10 @@
   for (MachineBasicBlock &BB : MF)
     processBlockPhase3(BB, TII);
 
+  // Process the S_ROUND_MODE instructions in a separate phase.
+  for (MachineBasicBlock &BB : MF)
+    processSRoundMode(BB, TII);
+
   BlockInfo.clear();
 
   return Changed;
Index: llvm/test/CodeGen/AMDGPU/llvm.experimental.fptrunc.round.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.experimental.fptrunc.round.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_gs void @test_experimental_fptrunc_round_upward(float %a, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_upward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_round_mode 0x4
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT:    s_round_mode 0x0
+; CHECK-NEXT:    global_store_short v[6:7], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res = call half @llvm.experimental.fptrunc.round.upward(float %a)
+  store half %res, half addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_experimental_fptrunc_round_downward(float %a, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_downward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_round_mode 0x8
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT:    s_round_mode 0x0
+; CHECK-NEXT:    global_store_short v[6:7], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res = call half @llvm.experimental.fptrunc.round.downward(float %a)
+  store half %res, half addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_experimental_fptrunc_round_upward_multiple_calls(float %a, float %b, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+; CHECK-LABEL: test_experimental_fptrunc_round_upward_multiple_calls:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_round_mode 0x4
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v1
+; CHECK-NEXT:    s_round_mode 0x0
+; CHECK-NEXT:    s_round_mode 0x8
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CHECK-NEXT:    s_round_mode 0x0
+; CHECK-NEXT:    v_add_f16_e32 v0, v0, v2
+; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
+; CHECK-NEXT:    global_store_short v[7:8], v0, off
+; CHECK-NEXT:    s_endpgm
+  %res1 = call half @llvm.experimental.fptrunc.round.upward(float %a)
+  %res2 = call half @llvm.experimental.fptrunc.round.upward(float %b)
+  %res3 = call half @llvm.experimental.fptrunc.round.downward(float %b)
+  %res4 = fadd half %res1, %res2
+  %res5 = fadd half %res3, %res4
+  store half %res5, half addrspace(1)* %out, align 4
+  ret void
+}
+
+declare half @llvm.experimental.fptrunc.round.upward(float)
+declare half @llvm.experimental.fptrunc.round.downward(float)
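
Usage note (not part of the diff): a minimal sketch of how a frontend might emit a call to the new intrinsic through IRBuilder. The helper name and the half/float overload arguments are assumptions for illustration; the final overload handling and name mangling of the experimental intrinsic may differ from the unmangled declarations used in the test above.

// Sketch only: create a call to @llvm.experimental.fptrunc.round.upward.
// Assumes the intrinsic is overloaded on both the result and source types.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static Value *emitFPTruncRoundUpward(IRBuilder<> &Builder, Module &M,
                                     Value *FloatVal) {
  LLVMContext &Ctx = M.getContext();
  // Resolve the overload as half <- float, matching the test above.
  Function *Decl = Intrinsic::getDeclaration(
      &M, Intrinsic::experimental_fptrunc_round_upward,
      {Type::getHalfTy(Ctx), Type::getFloatTy(Ctx)});
  return Builder.CreateCall(Decl, {FloatVal});
}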