Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -974,6 +974,20 @@
             [IntrNoMem, IntrSpeculatable]>;
 
 // See int_amdgcn_v_interp_p1 for why this is IntrNoMem.
+// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
+def int_amdgcn_interp_p1_f16 :
+  GCCBuiltin<"__builtin_amdgcn_interp_p1_f16">,
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
+            [IntrNoMem, IntrSpeculatable]>;
+
+// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
+def int_amdgcn_interp_p2_f16 :
+  GCCBuiltin<"__builtin_amdgcn_interp_p2_f16">,
+  Intrinsic<[llvm_half_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
+            [IntrNoMem, IntrSpeculatable]>;
+
 // Pixel shaders only: whether the current pixel is live (i.e. not a helper
 // invocation for derivative computation).
 def int_amdgcn_ps_live : Intrinsic <
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -57,6 +57,7 @@
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+FunctionPass *createSIModeRegisterPass();
 
 void initializeAMDGPUDAGToDAGISelPass(PassRegistry&);
 
@@ -179,6 +180,9 @@
 void initializeSIDebuggerInsertNopsPass(PassRegistry&);
 extern char &SIDebuggerInsertNopsID;
 
+void initializeSIModeRegisterPass(PassRegistry&);
+extern char &SIModeRegisterID;
+
 void initializeSIInsertWaitcntsPass(PassRegistry&);
 extern char &SIInsertWaitcntsID;
 
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -455,6 +455,9 @@
   INTERP_MOV,
   INTERP_P1,
   INTERP_P2,
+  INTERP_P1LL_F16,
+  INTERP_P1LV_F16,
+  INTERP_P2_F16,
   PC_ADD_REL_OFFSET,
   KILL,
   DUMMY_CHAIN,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4063,6 +4063,9 @@
   NODE_NAME_CASE(INTERP_MOV)
   NODE_NAME_CASE(INTERP_P1)
   NODE_NAME_CASE(INTERP_P2)
+  NODE_NAME_CASE(INTERP_P1LL_F16)
+  NODE_NAME_CASE(INTERP_P1LV_F16)
+  NODE_NAME_CASE(INTERP_P2_F16)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(LOAD_CONSTANT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -372,6 +372,17 @@
                           SDTypeProfile<1, 4, [SDTCisFP<0>]>,
                           [SDNPInGlue]>;
 
+def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16",
+                          SDTypeProfile<1, 7, [SDTCisFP<0>]>,
+                          [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16",
+                          SDTypeProfile<1, 9, [SDTCisFP<0>]>,
+                          [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16",
+                          SDTypeProfile<1, 8, [SDTCisFP<0>]>,
+                          [SDNPInGlue]>;
 
 def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
   [SDNPHasChain, SDNPSideEffect]>;
Index: lib/Target/AMDGPU/AMDGPUSearchableTables.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -49,6 +49,8 @@
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
+def : SourceOfDivergence<int_amdgcn_interp_p1_f16>;
+def : SourceOfDivergence<int_amdgcn_interp_p2_f16>;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -136,6 +136,13 @@
   cl::init(true),
   cl::Hidden);
 
+// Enable Mode register optimization
+static cl::opt<bool> EnableSIModeRegisterPass(
+  "amdgpu-mode-register",
+  cl::desc("Enable mode register pass"),
+  cl::init(true),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine X(getTheAMDGPUTarget());
@@ -171,6 +178,7 @@
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
+  initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
   initializeSIInsertSkipsPass(*PR);
@@ -884,6 +892,8 @@
 }
 
 void GCNPassConfig::addPreEmitPass() {
+  if (EnableSIModeRegisterPass)
+    addPass(createSIModeRegisterPass());
   // The hazard recognizer that runs as part of the post-ra scheduler does not
   // guarantee to be able handle all hazards correctly. This is because if there
   // are multiple scheduling regions in a basic block, the regions are scheduled
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -115,6 +115,7 @@
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
+  SIModeRegister.cpp
   )
 
 add_subdirectory(AsmParser)
Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -88,7 +88,10 @@
   IsPacked = UINT64_C(1) << 49,
 
   // Is a D16 buffer instruction.
-  D16Buf = UINT64_C(1) << 50
+  D16Buf = UINT64_C(1) << 50,
+
+  // Uses floating point double precision rounding mode
+  FPDPRounding = UINT64_C(1) << 51
 };
 
 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4838,6 +4838,59 @@
                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
                        Glue);
   }
+  case Intrinsic::amdgcn_interp_p1_f16: {
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
+    SDValue Glue = M0.getValue(1);
+    if (getSubtarget()->getLDSBankCount() == 16) {
+      // 16 bank LDS
+      SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
+                              DAG.getConstant(2, DL, MVT::i32), // P0
+                              Op.getOperand(2), // Attrchan
+                              Op.getOperand(3), // Attr
+                              Glue);
+      SDValue Ops[] = {
+        Op.getOperand(1), // Src0
+        Op.getOperand(2), // Attrchan
+        Op.getOperand(3), // Attr
+        DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+        S, // Src2 - holds two f16 values selected by high
+        DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+        Op.getOperand(4), // high
+        DAG.getConstant(0, DL, MVT::i1), // $clamp
+        DAG.getConstant(0, DL, MVT::i32) // $omod
+      };
+      return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
+    } else {
+      // 32 bank LDS
+      SDValue Ops[] = {
+        Op.getOperand(1), // Src0
+        Op.getOperand(2), // Attrchan
+        Op.getOperand(3), // Attr
+        DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+        Op.getOperand(4), // high
+        DAG.getConstant(0, DL, MVT::i1), // $clamp
+        DAG.getConstant(0, DL, MVT::i32), // $omod
+        Glue
+      };
+      return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
+    }
+  }
+  case Intrinsic::amdgcn_interp_p2_f16: {
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
+    SDValue Glue = M0.getValue(1);
+    SDValue Ops[] = {
+      Op.getOperand(2), // Src0
+      Op.getOperand(3), // Attrchan
+      Op.getOperand(4), // Attr
+      DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+      Op.getOperand(1), // Src2
+      DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+      Op.getOperand(5), // high
+      DAG.getConstant(0, DL, MVT::i1), // $clamp
+      Glue
+    };
+    return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
+  }
   case Intrinsic::amdgcn_sin:
     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
 
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -121,6 +121,10 @@
   // This bit indicates that this is a D16 buffer instruction.
   field bit D16Buf = 0;
 
+  // This bit indicates that this uses the floating point double precision
+  // rounding mode flags
+  field bit FPDPRounding = 0;
+
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = SALU;
   let TSFlags{1} = VALU;
@@ -178,6 +182,8 @@
 
   let TSFlags{50} = D16Buf;
 
+  let TSFlags{51} = FPDPRounding;
+
   let SchedRW = [Write32Bit];
 
   field bits<1> DisableSIDecoder = 0;
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -589,6 +589,14 @@
     return MI.getDesc().TSFlags & ClampFlags;
   }
 
+  static bool usesFPDPRounding(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
+  bool usesFPDPRounding(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(MI.isCopy());
     unsigned Dest = MI.getOperand(0).getReg();
Index: lib/Target/AMDGPU/SIModeRegister.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIModeRegister.cpp
@@ -0,0 +1,224 @@
+//===-- SIModeRegister.cpp - Mode Register --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// The pass inserts changes to the Mode register settings as required.
+/// Currently only the double precision floating point rounding mode setting is
+/// handled.
+//===----------------------------------------------------------------------===//
+//
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "AMDGPUInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-mode-register"
+
+STATISTIC(NumSetregInserted,
+          "Number of setreg of mode register inserted.");
+
+using namespace llvm;
+
+namespace {
+
+class SIModeRegister : public MachineFunctionPass {
+public:
+  static char ID;
+
+  // Per-block traversal state, indexed by MachineBasicBlock::getNumber() and
+  // reset for every function by runOnMachineFunction():
+  //   complete     - result of the last finished walk of the block's successors
+  //   revisits     - number of successor walks of the block still in progress
+  //   canSkipBlock - a scan found no DP rounding user and no setreg in the block
+  SmallVector<bool, 16> complete;
+  SmallVector<unsigned, 16> revisits;
+  SmallVector<bool, 16> canSkipBlock;
+
+  // Set when a setreg is inserted into the current function. The
+  // NumSetregInserted statistic cannot be used for this: it accumulates over
+  // all functions and does not count when statistics are compiled out.
+  bool Changed = false;
+
+  // We currently assume the default rounding mode is Round to Nearest
+  // NOTE: this should come from a per function rounding mode setting once such
+  // a setting exists.
+  int defaultMode = FP_ROUND_ROUND_TO_NEAREST;
+
+public:
+  SIModeRegister() : MachineFunctionPass(ID) {
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool processBlock(MachineBasicBlock &MBB, const SIInstrInfo *TII,
+                    int currentMode);
+
+  int getRequiredMode(unsigned opcode);
+};
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE,
+                "Insert required mode register values", false, false)
+
+char SIModeRegister::ID = 0;
+
+char &llvm::SIModeRegisterID = SIModeRegister::ID;
+
+FunctionPass *llvm::createSIModeRegisterPass() {
+  return new SIModeRegister();
+}
+
+// Determine the DP FP rounding mode required for this instruction
+int SIModeRegister::getRequiredMode(unsigned opcode) {
+  switch (opcode) {
+  case AMDGPU::V_INTERP_P1LL_F16:
+  case AMDGPU::V_INTERP_P1LV_F16:
+  case AMDGPU::V_INTERP_P2_F16:
+    // f16 interpolation instructions need round to zero
+    return FP_ROUND_ROUND_TO_ZERO;
+  default:
+    return defaultMode;
+  }
+}
+
+// We iterate through the instructions of each block and for any that use the
+// FP DP rounding mode we check that the current mode is appropriate. If
+// not we insert a setreg to change it. If we find a setreg that modifies the
+// rounding mode we track that as the current value.
+// We then recursively propagate the final value to all the successor blocks.
+// For back-edges we need to revisit blocks until we revisit a block and find
+// an instruction that uses the DP rounding mode or a setreg that modifies it
+// (in those cases we know successor blocks already have the required modes set)
+// or we visit a block for the second time (we know there are no instructions
+// that use or set the FP DP rounding mode)
+bool SIModeRegister::processBlock(MachineBasicBlock &MBB,
+                                  const SIInstrInfo *TII, int currentMode) {
+  if (!canSkipBlock[MBB.getNumber()]) {
+    canSkipBlock[MBB.getNumber()] = true;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.SkipPHIsLabelsAndDebug(MBB.begin()); I != MBB.end();
+         I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+      if (TII->usesFPDPRounding(MI)) {
+        canSkipBlock[MBB.getNumber()] = false;
+        // This instruction uses the DP rounding mode - check that the current
+        // mode is suitable, and if not insert a setreg to change the mode
+        int requiredMode = getRequiredMode(MI.getOpcode());
+        if (currentMode != requiredMode) {
+          currentMode = requiredMode;
+          // simm16 0x881 selects the MODE register, offset 2, width 2
+          BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
+            .addImm(currentMode).addImm(0x881);
+          ++NumSetregInserted;
+          Changed = true;
+        }
+        if ((revisits[MBB.getNumber()] >= 1 ))
+          return true;
+      } else if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) ||
+                 (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) {
+        // track changes to the rounding mode
+
+        canSkipBlock[MBB.getNumber()] = false;
+
+        // ignore setreg if not writing to MODE register
+        unsigned dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)
+                          ->getImm();
+        if (((dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
+            AMDGPU::Hwreg::ID_MODE)
+          continue;
+
+        unsigned width = ((dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
+                          AMDGPU::Hwreg::WIDTH_M1_SHIFT_) + 1;
+        unsigned offset = (dst & AMDGPU::Hwreg::OFFSET_MASK_) >>
+                          AMDGPU::Hwreg::OFFSET_SHIFT_;
+        // width may be as large as the register, so form the mask in 64 bits
+        // to avoid an undefined shift of a 32 bit value by 32
+        unsigned mask =
+            static_cast<unsigned>(((UINT64_C(1) << width) - 1) << offset);
+
+        // skip if not updating any part of the DP rounding mode
+        if ((mask & FP_ROUND_MODE_DP(3)) == 0)
+          continue;
+        // it is possible for the setreg to update only part of the DP mode
+        // field so we'll mask the current and new modes appropriately -
+        // however, if we don't know the current mode we can't use a partial
+        // value
+        bool partial = ((mask & FP_ROUND_MODE_DP(3)) != FP_ROUND_MODE_DP(3));
+        if (partial && (currentMode == -1))
+          continue;
+        if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) {
+          unsigned val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)
+                            ->getImm();
+          currentMode = (((val << offset) & FP_ROUND_MODE_DP(3)) |
+                         ((FP_ROUND_MODE_DP(currentMode) & ~mask))) >> 2;
+        } else
+          currentMode = -1;
+        // if it was a partial update we may have a different currentMode from
+        // values via different paths so we need to continue the propagation,
+        // otherwise if we are revisiting the block we can return
+        if ((revisits[MBB.getNumber()] >= 1 ) && !partial)
+          return true;
+      }
+    }
+  }
+
+  // propagate the current mode to all successor blocks
+  bool successorsComplete = true;
+  if (revisits[MBB.getNumber()] < 2) {
+    ++revisits[MBB.getNumber()];
+    for (MachineBasicBlock *Succ : MBB.successors()) {
+      if (!complete[Succ->getNumber()])
+        if (!processBlock(*Succ, TII, currentMode))
+          successorsComplete = false;
+    }
+    --revisits[MBB.getNumber()];
+  }
+  complete[MBB.getNumber()] = successorsComplete;
+  return successorsComplete;
+}
+
+// The DP Rounding flags within the Mode register are used to control both
+// 64 bit and 16 bit floating point rounding behavior.
+// The 16 bit interpolation instructions require Round to Zero for correct
+// results, so explicit mode changes may need to be inserted to ensure
+// each instruction has the required mode.
+// Other mode register settings may need to be tracked in the future.
+bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
+  // This pass is required for correctness (f16 interpolation needs round to
+  // zero), so it must run even for functions that opt out of optimization;
+  // it therefore does not consult skipFunction().
+
+  // The pass object is reused for every function, so start from clean
+  // per-block state rather than keeping entries left by the previous one.
+  Changed = false;
+  const unsigned NumBlocks = MF.getNumBlockIDs();
+  revisits.assign(NumBlocks, 0);
+  complete.assign(NumBlocks, false);
+  canSkipBlock.assign(NumBlocks, false);
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  processBlock(MF.front(), TII, defaultMode);
+
+  return Changed;
+}
Index: lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP1Instructions.td
+++ lib/Target/AMDGPU/VOP1Instructions.td
@@ -203,14 +203,14 @@
 defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
 } // End SchedRW = [WriteQuarterRate32]
 
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDouble], FPDPRounding = 1 in {
 defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
 defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
-} // End SchedRW = [WriteDouble];
+} // End SchedRW = [WriteDouble], FPDPRounding = 1
 
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDouble], FPDPRounding = 1 in {
 defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
-} // End SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDouble], FPDPRounding = 1
 
 let SchedRW = [WriteQuarterRate32] in {
 defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
Index: lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP3Instructions.td
+++ lib/Target/AMDGPU/VOP3Instructions.td
@@ -219,7 +219,8 @@
 // VOP3 INTERP
 //===----------------------------------------------------------------------===//
 
-class VOP3Interp<string OpName, VOPProfile P> : VOP3_Pseudo<OpName, P> {
+class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
+                 VOP3_Pseudo<OpName, P, pattern> {
   let AsmMatchConverter = "cvtVOP3Interp";
 }
 
@@ -290,13 +291,13 @@
 def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile, fma>;
 def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile, int_amdgcn_lerp>;
 
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
 def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile, fma>;
 def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile, fadd, 1>;
 def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile, fmul, 1>;
 def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile, fminnum, 1>;
 def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum, 1>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
 
 let SchedRW = [WriteQuarterRate32] in {
 def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile>;
@@ -323,6 +324,7 @@
 def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
   getVOP3VCC.ret> {
   let SchedRW = [WriteDouble];
+  let FPDPRounding = 1;
 }
 } // End Uses = [VCC, EXEC]
 
@@ -353,10 +355,10 @@
 def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile, int_amdgcn_cvt_pk_u8_f32>;
 def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile, AMDGPUdiv_fixup>;
 
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
 def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile, AMDGPUdiv_fixup>;
 def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile, AMDGPUldexp, 1>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
 
 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
   let SchedRW = [WriteFloatFMA, WriteSALU];
@@ -367,6 +369,7 @@
 def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
   let SchedRW = [WriteDouble, WriteSALU];
   let AsmMatchConverter = "";
+  let FPDPRounding = 1;
 }
 
 def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile>;
@@ -377,6 +380,7 @@
 
 def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile, AMDGPUtrig_preop> {
   let SchedRW = [WriteDouble];
+  let FPDPRounding = 1;
 }
 
 let SchedRW = [Write64Bit] in {
@@ -428,8 +432,17 @@
 def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>;
 def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>;
 def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, fma>;
-def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
-}
+let Uses = [M0, EXEC], FPDPRounding = 1 in {
+def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
+       [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan),
+                                                (i32 imm:$attr),
+                                                (i32 imm:$src0_modifiers),
+                                                (f32 VRegSrc_32:$src2),
+                                                (i32 imm:$src2_modifiers),
+                                                (i1 imm:$high),
+                                                (i1 imm:$clamp)))]>;
+} // End Uses = [M0, EXEC], FPDPRounding = 1
+} // End renamedInGfx9 = 1
 
 let SubtargetPredicate = isGFX9 in {
 def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile>;
@@ -439,8 +452,24 @@
 def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
 } // End SubtargetPredicate = isGFX9
 
-def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
-def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+let Uses = [M0, EXEC], FPDPRounding = 1 in {
+def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
+       [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan),
+                                                  (i32 imm:$attr),
+                                                  (i32 imm:$src0_modifiers),
+                                                  (i1 imm:$high),
+                                                  (i1 imm:$clamp),
+                                                  (i32 imm:$omod)))]>;
+def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
+       [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan),
+                                                  (i32 imm:$attr),
+                                                  (i32 imm:$src0_modifiers),
+                                                  (f32 VRegSrc_32:$src2),
+                                                  (i32 imm:$src2_modifiers),
+                                                  (i1 imm:$high),
+                                                  (i1 imm:$clamp),
+                                                  (i32 imm:$omod)))]>;
+} // End Uses = [M0, EXEC], FPDPRounding = 1
 
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
 
Index: test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.interp.f16.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9_32BANK %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8_32BANK %s
+; RUN: llc -march=amdgcn -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8_16BANK %s
+
+; GFX9_32BANK-LABEL: {{^}}interp_f16:
+; GFX9_32BANK: s_mov_b32 m0, s{{[0-9]+}}
+; GFX9_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y{{$}}
+; GFX9_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y high
+; GFX9_32BANK: v_interp_p2_legacy_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}}
+; GFX9_32BANK: v_interp_p2_legacy_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high
+
+; GFX8_32BANK-LABEL: {{^}}interp_f16:
+; GFX8_32BANK: s_mov_b32 m0, s{{[0-9]+}}
+; GFX8_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y{{$}}
+; GFX8_32BANK: v_interp_p1ll_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y high
+; GFX8_32BANK: v_interp_p2_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}}
+; GFX8_32BANK: v_interp_p2_f16{{(_e32)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high
+
+; GFX8_16BANK-LABEL: {{^}}interp_f16:
+; GFX8_16BANK: s_mov_b32 m0, s{{[0-9]+}}
+; there should be only one v_interp_mov
+; GFX8_16BANK: v_interp_mov_f32_e32 v{{[0-9]+}}, p0, attr2.y
+; GFX8_16BANK-NOT: v_interp_mov_f32_e32 v{{[0-9]+}}, p0, attr2.y
+; GFX8_16BANK: v_interp_p1lv_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}}
+; GFX8_16BANK: v_interp_p1lv_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high
+; GFX8_16BANK: v_interp_p2_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}}{{$}}
+; GFX8_16BANK: v_interp_p2_f16{{(_e64)*}} v{{[0-9]+}}, v{{[0-9]+}}, attr2.y, v{{[0-9]*}} high
+
+define amdgpu_ps half @interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+main_body:
+  %p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0)
+  %p2_0 = call half @llvm.amdgcn.interp.p2.f16(float %p1_0, float %j, i32 1, i32 2, i1 0, i32 %m0)
+  %p1_1 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 1, i32 %m0)
+  %p2_1 = call half @llvm.amdgcn.interp.p2.f16(float %p1_1, float %j, i32 1, i32 2, i1 1, i32 %m0)
+  %res = fadd half %p2_0, %p2_1
+  ret half %res
+}
+
+; float @llvm.amdgcn.interp.p1.f16(i, attrchan, attr, high, m0)
+declare float @llvm.amdgcn.interp.p1.f16(float, i32, i32, i1, i32) #0
+; half @llvm.amdgcn.interp.p2.f16(p1, j, attrchan, attr, high, m0)
+declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/mode-register.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/mode-register.mir
@@ -0,0 +1,176 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-mode-register %s -o - | FileCheck %s
+
+---
+# check that the mode is changed to rtz from default rtn for interp f16
+# CHECK-LABEL: name: interp_f16_default
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: interp_f16_default
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is not changed for interp f16 when the mode is already RTZ
+# CHECK-LABEL: name: interp_f16_explicit_rtz
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: interp_f16_explicit_rtz
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_SETREG_IMM32_B32 3, 2177
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is unchanged from RTN for F64 instruction
+# CHECK-LABEL: name: rtn_default
+# CHECK-LABEL: bb.0:
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK: V_SQRT_F64
+
+name: rtn_default
+
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    $vgpr1_vgpr2 = V_SQRT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is changed from RTZ to RTN for F64 instruction
+# CHECK-LABEL: name: rtn_from_rtz
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK: V_SQRT_F64
+
+name: rtn_from_rtz
+
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    S_SETREG_IMM32_B32 3, 2177
+    $vgpr1_vgpr2 = V_SQRT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is changed from RTZ to RTN for the F64 instruction that
+# sits between the interp instructions, and back to RTZ for the remaining interp
+# CHECK-LABEL: name: interp_f16_plus_sqrt_f64
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P2_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_SQRT_F64
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P2_F16
+
+name: interp_f16_plus_sqrt_f64
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that an explicit setreg of the single precision mode (the "2, 2049" write in the body) does not affect the tracked DP mode
+# CHECK-LABEL: name: single_precision_mode_change
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P2_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_SQRT_F64
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P2_F16
+
+name: single_precision_mode_change
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_SETREG_IMM32_B32 2, 2049
+    $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is propagated around the back-edge to the start of the loop: the F64
+# instruction in bb.1 wants the default RTN but still needs a setreg because bb.2 sets RTZ and branches back
+# CHECK-LABEL: name: loop
+# CHECK-LABEL: bb.1:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_SQRT_F64
+# CHECK-LABEL: bb.2:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: loop
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.1, %bb.3
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_ENDPGM
+...