Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -57,6 +57,7 @@
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+FunctionPass *createSIModeRegisterPass();
 
 void initializeAMDGPUDAGToDAGISelPass(PassRegistry&);
 
@@ -179,6 +180,9 @@
 void initializeSIDebuggerInsertNopsPass(PassRegistry&);
 extern char &SIDebuggerInsertNopsID;
 
+void initializeSIModeRegisterPass(PassRegistry&);
+extern char &SIModeRegisterID;
+
 void initializeSIInsertWaitcntsPass(PassRegistry&);
 extern char &SIInsertWaitcntsID;
 
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -137,6 +137,13 @@
   cl::init(true),
   cl::Hidden);
 
+// Enable Mode register optimization
+static cl::opt<bool> EnableSIModeRegisterPass(
+  "amdgpu-mode-register",
+  cl::desc("Enable mode register pass"),
+  cl::init(true),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -172,6 +179,7 @@
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
+  initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
   initializeSIInsertSkipsPass(*PR);
@@ -886,6 +894,7 @@
 }
 
 void GCNPassConfig::addPreEmitPass() {
+  addPass(createSIModeRegisterPass());
   addPass(createSIMemoryLegalizerPass());
   addPass(createSIInsertWaitcntsPass());
   addPass(createSIShrinkInstructionsPass());
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -116,6 +116,7 @@
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
+  SIModeRegister.cpp
   )
 
 add_subdirectory(AsmParser)
Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -88,7 +88,10 @@
   IsPacked = UINT64_C(1) << 49,
 
   // Is a D16 buffer instruction.
-  D16Buf = UINT64_C(1) << 50
+  D16Buf = UINT64_C(1) << 50,
+
+  // Uses floating point double precision rounding mode
+  FPDPRounding = UINT64_C(1) << 51
 };
 
 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -121,6 +121,10 @@
   // This bit indicates that this is a D16 buffer instruction.
   field bit D16Buf = 0;
 
+  // This bit indicates that this instruction uses the floating point double
+  // precision rounding mode flags
+  field bit FPDPRounding = 0;
+
   // These need to be kept in sync with the enum in SIInstrFlags.
  let TSFlags{0} = SALU;
  let TSFlags{1} = VALU;
@@ -178,6 +182,8 @@
 
   let TSFlags{50} = D16Buf;
 
+  let TSFlags{51} = FPDPRounding;
+
   let SchedRW = [Write32Bit];
 
   field bits<1> DisableSIDecoder = 0;
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -589,6 +589,14 @@
     return MI.getDesc().TSFlags & ClampFlags;
   }
 
+  static bool usesFPDPRounding(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
+  bool usesFPDPRounding(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(MI.isCopy());
     unsigned Dest = MI.getOperand(0).getReg();
Index: lib/Target/AMDGPU/SIModeRegister.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIModeRegister.cpp
@@ -0,0 +1,372 @@
+//===-- SIModeRegister.cpp - Mode Register --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass inserts changes to the Mode register settings as required.
+/// Note that currently it only deals with the Double Precision Floating Point
+/// rounding mode setting, but is intended to be generic enough to be easily
+/// expanded.
+///
+//===----------------------------------------------------------------------===//
+//
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <queue>
+
+#define DEBUG_TYPE "si-mode-register"
+
+STATISTIC(NumSetregInserted, "Number of setreg of mode register inserted.");
+
+using namespace llvm;
+
+struct Status {
+  // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a
+  // known value
+  unsigned Mask;
+  unsigned Mode;
+
+  Status() : Mask(0), Mode(0){};
+  Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) {
+    Mode &= Mask;
+  };
+
+  // Merge two status values; the result is known wherever either input is
+  // known, and where both are known the value from S takes precedence.
+  Status merge(const Status &S) const {
+    return Status((Mask | S.Mask), ((Mode & ~S.Mask) | (S.Mode & S.Mask)));
+  }
+
+  // Merge an unknown value by using the unknown value's mask to remove bits
+  // from the result
+  Status mergeUnknown(unsigned newMask) {
+    return Status(Mask & ~newMask, Mode & ~newMask);
+  }
+
+  // Intersect two Status values to produce a mode and mask that is a subset
+  // of both values
+  Status intersect(const Status &S) const {
+    unsigned NewMask = (Mask & S.Mask) & (Mode ^ ~S.Mode);
+    unsigned NewMode = (Mode & NewMask);
+    return Status(NewMask, NewMode);
+  }
+
+  // Produce the delta required to change the Mode to the required Mode
+  Status delta(const Status &S) const {
+    return Status((S.Mask & (Mode ^ S.Mode)) | (~Mask & S.Mask), S.Mode);
+  }
+
+  bool operator==(const Status &S) const {
+    return (Mask == S.Mask) && (Mode == S.Mode);
+  }
+
+  bool operator!=(const Status &S) const { return !(*this == S); }
+
+  bool isCompatible(Status &S) {
+    return (!((Mask & S.Mask) & (Mode ^ S.Mode)));
+  }
+
+  bool isSubset(Status &S) {
+    return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode);
+  }
+};
+
+class BlockData {
+public:
+  // The Status that represents the mode register settings required on entry to
+  // this block. Calculated in Phase 1.
+  Status Require;
+
+  // The Status that represents the net changes to the Mode register made by
+  // this block. Calculated in Phase 1.
+  Status Change;
+
+  // The Status that represents the mode register settings on exit from this
+  // block. Calculated in Phase 2.
+  Status Exit;
+
+  // The Status that represents the intersection of exit Mode register settings
+  // from all predecessor blocks. Calculated in Phase 2, and used by Phase 3.
+  Status Pred;
+};
+
+namespace {
+
+class SIModeRegister : public MachineFunctionPass {
+public:
+  static char ID;
+
+  std::vector<BlockData *> BlockInfo;
+  std::queue<MachineBasicBlock *> Phase2List;
+
+  // The default mode register setting currently only caters for the floating
+  // point double precision rounding mode.
+  // We currently assume the default rounding mode is Round to Nearest
+  // NOTE: this should come from a per function rounding mode setting once such
+  // a setting exists.
+  unsigned DefaultMode = FP_ROUND_ROUND_TO_NEAREST;
+  Status DefaultStatus =
+      Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode));
+
+public:
+  SIModeRegister() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  void processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  Status getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII);
+
+  void insertSetreg(MachineBasicBlock &MBB, MachineInstr *I,
+                    const SIInstrInfo *TII, Status InstrMode);
+};
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE,
+                "Insert required mode register values", false, false)
+
+char SIModeRegister::ID = 0;
+
+char &llvm::SIModeRegisterID = SIModeRegister::ID;
+
+FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); }
+
+// Determine the Mode register setting required for this instruction.
+// Instructions which don't use the Mode register return a null Status.
+// Note this currently only deals with instructions that use the floating point
+// double precision setting.
+Status SIModeRegister::getInstructionMode(MachineInstr &MI,
+                                          const SIInstrInfo *TII) {
+  if (TII->usesFPDPRounding(MI)) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_INTERP_P1LL_F16:
+    case AMDGPU::V_INTERP_P1LV_F16:
+    case AMDGPU::V_INTERP_P2_F16:
+      // f16 interpolation instructions need double precision round to zero
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+    default:
+      return DefaultStatus;
+    }
+  }
+  return Status();
+}
+
+// Insert a setreg instruction to update the Mode register.
+// It is possible (though unlikely) for an instruction to require a change to
+// the value of disjoint parts of the Mode register when we don't know the
+// value of the intervening bits. In that case we need to use more than one
+// setreg instruction.
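+//
+// For example (illustrative values): a delta with Mask = 0b101 and
+// Mode = 0b101 is emitted as two setregs, one covering bit 0 and one covering
+// bit 2, because each S_SETREG writes a single contiguous field.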
+void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
+                                  const SIInstrInfo *TII, Status InstrMode) {
+  while (InstrMode.Mask) {
+    unsigned Offset = countTrailingZeros(InstrMode.Mask);
+    unsigned Width = countTrailingOnes(InstrMode.Mask >> Offset);
+    unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
+    BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+        .addImm(Value)
+        .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
+                (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+                (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
+    ++NumSetregInserted;
+    InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
+  }
+}
+
+// In Phase 1 we iterate through the instructions of the block and for each
+// instruction we get its mode usage. If the instruction uses the Mode register
+// we:
+// - update the Change status, which tracks the changes to the Mode register
+//   made by this block
+// - if this instruction's requirements are compatible with the current setting
+//   of the Mode register we merge the modes
+// - if it isn't compatible and an InsertionPoint isn't set, then we set the
+//   InsertionPoint to the current instruction, and we remember the current
+//   mode
+// - if it isn't compatible and InsertionPoint is set we insert a setreg before
+//   the current instruction (unless this instruction forms part of the block's
+//   entry requirements, in which case the insertion is deferred until Phase 3
+//   when predecessor exit values are known), and move the insertion point to
+//   this instruction
+// - if this is a setreg instruction we treat it as an incompatible instruction.
+//   This is sub-optimal but avoids some nasty corner cases, and is expected to
+//   occur very rarely.
+// - on exit we have set the Require, Change, and initial Exit modes.
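+//
+// As an illustrative trace (mirroring the interp_f16_plus_sqrt_f64 test
+// below): for the in-block sequence [interp f16, sqrt f64, interp f16] the
+// first interp's RTZ requirement becomes the block's Require (its setreg, if
+// any, is left to Phase 3), the sqrt gets a setreg back to RTN inserted
+// before it, and the final interp gets a setreg to RTZ inserted before it.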
+void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, + const SIInstrInfo *TII) { + BlockData *NewInfo = new BlockData; + MachineInstr *InsertionPoint = nullptr; + bool RequirePending = true; + Status IPChange; + for (MachineInstr &MI : MBB) { + Status InstrMode = getInstructionMode(MI, TII); + if (InstrMode != Status()) { + // This instruction uses the Mode register + if (NewInfo->Change.isCompatible(InstrMode)) { + // This instruction is compatible, so just merge the mode + NewInfo->Change = NewInfo->Change.merge(InstrMode); + } else if (InsertionPoint) { + // We need a setreg at the InsertionPoint, but we defer the first in + // each block to Phase 3 + if (RequirePending) { + NewInfo->Require = NewInfo->Change; + RequirePending = false; + } else { + insertSetreg(MBB, InsertionPoint, TII, + IPChange.delta(NewInfo->Change)); + } + // Set a new InsertionPoint + InsertionPoint = &MI; + IPChange = NewInfo->Change; + NewInfo->Change = NewInfo->Change.merge(InstrMode); + } + if (!InsertionPoint) { + InsertionPoint = &MI; + IPChange = NewInfo->Change; + NewInfo->Change = NewInfo->Change.merge(InstrMode); + } + } else if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) || + (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) { + unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); + if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) != + AMDGPU::Hwreg::ID_MODE) + continue; + + unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >> + AMDGPU::Hwreg::WIDTH_M1_SHIFT_) + + 1; + unsigned Offset = + (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_; + unsigned Mask = ((1 << Width) - 1) << Offset; + + if (RequirePending) { + NewInfo->Require = NewInfo->Change; + RequirePending = false; + } else if (InsertionPoint) { + // We need to insert a setreg at the InsertionPoint + insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change)); + InsertionPoint = nullptr; + } + if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) { + unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm(); + unsigned Mode = (Val << Offset) & Mask; + Status Setreg = Status(Mask, Mode); + NewInfo->Change = NewInfo->Change.merge(Setreg); + } else { + NewInfo->Change = NewInfo->Change.mergeUnknown(Mask); + } + } + } + if (RequirePending) { + NewInfo->Require = NewInfo->Change; + } else if (InsertionPoint) { + // We need to insert a setreg at the InsertionPoint + insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change)); + } + NewInfo->Exit = NewInfo->Change; + BlockInfo[MBB.getNumber()] = NewInfo; + Phase2List.push(&MBB); +} + +// In Phase 2 we revisit each block and calculate the common Mode register +// value provided by all predecessor blocks. If the Exit value for the block +// is changed, then we add the successor blocks to the worklist so that the +// exit value is propagated. +void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB, + const SIInstrInfo *TII) { + BlockData *BI = BlockInfo[MBB.getNumber()]; + if (MBB.pred_empty()) { + // There are no predecessors, so use the default starting status. + BI->Pred = DefaultStatus; + } else { + // Build a status that is common to all the predecessors by intersecting + // all the predecessor exit status values. 
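+    // (A Mode bit is treated as known at this merge point only if every
+    // predecessor knows it and they all agree on its value; any disagreement
+    // leaves the bit unknown, and Phase 3 will then insert a setreg in the
+    // blocks that need a specific value.)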
+    MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end();
+    MachineBasicBlock &PB = *(*P);
+    BI->Pred = BlockInfo[PB.getNumber()]->Exit;
+
+    for (P = std::next(P); P != E; P = std::next(P)) {
+      MachineBasicBlock *Pred = *P;
+      BlockData *PI = BlockInfo[Pred->getNumber()];
+      BI->Pred = BI->Pred.intersect(PI->Exit);
+    }
+  }
+  Status TmpStatus = BI->Pred.merge(BI->Change);
+  if (BI->Exit != TmpStatus) {
+    BI->Exit = TmpStatus;
+    // Add the successors to the work list so we can propagate the changed exit
+    // status.
+    for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
+                                          E = MBB.succ_end();
+         S != E; S = std::next(S)) {
+      MachineBasicBlock &B = *(*S);
+      Phase2List.push(&B);
+    }
+  }
+}
+
+// In Phase 3 we revisit each block and check whether the common predecessor
+// exit mode meets the block's entry requirements. If not, we insert an
+// appropriate setreg instruction at the start of the block to modify the Mode
+// register.
+void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  BlockData *BI = BlockInfo[MBB.getNumber()];
+  if (!BI->Pred.isSubset(BI->Require)) {
+    Status Delta = BI->Pred.delta(BI->Require);
+    insertSetreg(MBB, &MBB.instr_front(), TII, Delta);
+  }
+}
+
+bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
+  BlockInfo.resize(MF.getNumBlockIDs());
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Processing is performed in a number of phases:
+
+  // Phase 1 - determine the mode required at entry to each block, and add
+  // setreg instructions for intra block requirements
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase1(BB, TII);
+
+  // Phase 2 - determine the exit mode from each block
+  while (!Phase2List.empty()) {
+    processBlockPhase2(*Phase2List.front(), TII);
+    Phase2List.pop();
+  }
+
+  // Phase 3 - add setreg to the start of each block where the required entry
+  // mode is not satisfied by the exit mode of all its predecessors.
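+  // Note: the setreg inserted by Phase 3 rewrites only the field bits where
+  // the predecessor mode and the required mode differ. Also note that
+  // NumSetregInserted is a Statistic whose lifetime spans the whole pass run,
+  // so the value returned below conservatively reports modification once any
+  // setreg has been inserted.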
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase3(BB, TII);
+
+  // Free the per-block data before clearing the vector of pointers.
+  for (BlockData *BD : BlockInfo)
+    delete BD;
+  BlockInfo.clear();
+
+  return NumSetregInserted > 0;
+}
Index: lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP1Instructions.td
+++ lib/Target/AMDGPU/VOP1Instructions.td
@@ -203,14 +203,14 @@
 defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
 } // End SchedRW = [WriteQuarterRate32]
 
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDouble], FPDPRounding = 1 in {
 defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
 defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
-} // End SchedRW = [WriteDouble];
+} // End SchedRW = [WriteDouble], FPDPRounding = 1
 
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDouble], FPDPRounding = 1 in {
 defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
-} // End SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDouble], FPDPRounding = 1
 
 let SchedRW = [WriteQuarterRate32] in {
 defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
Index: lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP3Instructions.td
+++ lib/Target/AMDGPU/VOP3Instructions.td
@@ -290,13 +290,13 @@
 def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
 def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
 
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
 def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
 def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
 def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
 def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
 def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
 
 let SchedRW = [WriteQuarterRate32] in {
 def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>>;
@@ -323,6 +323,7 @@
 def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
   getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
   let SchedRW = [WriteDouble];
+  let FPDPRounding = 1;
 }
 } // End Uses = [VCC, EXEC]
 
@@ -353,14 +354,15 @@
 def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
 def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
 
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
 def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
 def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
 
 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
   let SchedRW = [WriteFloatFMA, WriteSALU];
   let AsmMatchConverter = "";
+  let FPDPRounding = 1;
 }
 
 // Double precision division pre-scale.
@@ -377,6 +379,7 @@
 def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
   let SchedRW = [WriteDouble];
+  let FPDPRounding = 1;
 }
 
 let SchedRW = [Write64Bit] in {
@@ -428,7 +431,9 @@
 def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
 def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
 def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
+let FPDPRounding = 1 in {
 def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
+} // End FPDPRounding = 1
 }
 
 let SubtargetPredicate = isGFX9 in {
@@ -439,8 +444,10 @@
 def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
 } // End SubtargetPredicate = isGFX9
 
+let FPDPRounding = 1 in {
 def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
 def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+} // End FPDPRounding = 1
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
Index: test/CodeGen/AMDGPU/mode-register.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/mode-register.mir
@@ -0,0 +1,427 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-mode-register %s -o - | FileCheck %s
+
+---
+# check that the mode is changed to rtz from default rtn for interp f16
+# CHECK-LABEL: name: interp_f16_default
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: interp_f16_default
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is not changed for interp f16 when the mode is already RTZ
+# CHECK-LABEL: name: interp_f16_explicit_rtz
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: interp_f16_explicit_rtz
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_SETREG_IMM32_B32 3, 2177
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is unchanged from RTN for F64 instruction
+# CHECK-LABEL: name: rtn_default
+# CHECK-LABEL: bb.0:
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK: V_SQRT_F64
+
+name: rtn_default
+
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    $vgpr1_vgpr2 = V_SQRT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_ENDPGM
+...
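+# Note on the setreg immediates used in these tests: per the hwreg encoding,
+# the simm16 operand packs the register ID into bits [5:0], the field offset
+# into bits [10:6], and the field width minus one into bits [15:11]. Thus
+# 2177 = (1 << 11) | (2 << 6) | 1 selects the 2-bit double precision rounding
+# field of the MODE register, written with 0 for round-to-nearest and 3 for
+# round-to-zero.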
+--- +# check that the mode is changed from RTZ to RTN for F64 instruction +# CHECK-LABEL: name: rtn_from_rtz +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NEXT: S_SETREG_IMM32_B32 0, 2177 +# CHECK-NOT: S_SETREG_IMM32_B32 +# CHECK: V_SQRT_F64 + +name: rtn_from_rtz + +body: | + bb.0: + liveins: $vgpr1_vgpr2 + S_SETREG_IMM32_B32 3, 2177 + $vgpr1_vgpr2 = V_SQRT_F64_e32 killed $vgpr1_vgpr2, implicit $exec + S_ENDPGM +... +--- +# CHECK-LABEL: name: rtz_from_rtn +# CHECK-LABEL: bb.1: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: rtz_from_rtn + +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr1_vgpr2 + $vgpr1_vgpr2 = V_SQRT_F64_e32 killed $vgpr1_vgpr2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_ENDPGM +... +--- +# check that the mode is changed from RTZ to RTN for F64 instruction +# and back again for remaining interp instruction +# CHECK-LABEL: name: interp_f16_plus_sqrt_f64 +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P2_F16 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_SQRT_F64 +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P2_F16 + +name: interp_f16_plus_sqrt_f64 + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + $m0 = S_MOV_B32 killed $sgpr2 + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec + S_ENDPGM +... +--- +# check that an explicit change to the single precision mode has no effect +# CHECK-LABEL: name: single_precision_mode_change +# CHECK-LABEL: bb.0: +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P1LL_F16 +# CHECK: V_INTERP_P2_F16 +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_SQRT_F64 +# CHECK: S_SETREG_IMM32_B32 3, 2177 +# CHECK: V_INTERP_P2_F16 + +name: single_precision_mode_change + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + $m0 = S_MOV_B32 killed $sgpr2 + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_SETREG_IMM32_B32 2, 2049 + $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec + $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec + $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec + $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec + $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec + S_ENDPGM +... 
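+# (In the test above, 2049 = (1 << 11) | (0 << 6) | 1 selects the 2-bit single
+# precision rounding field at offset 0; its mask is disjoint from the double
+# precision field at offset 2, so the DP rounding state is still tracked
+# across the unrelated setreg.)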
+---
+# check that the mode is propagated back to the start of the loop - the first
+# instruction needs RTN but requires a setreg because RTZ is set later in the loop
+# CHECK-LABEL: name: loop
+# CHECK-LABEL: bb.1:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_SQRT_F64
+# CHECK-LABEL: bb.2:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: loop
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.1, %bb.3
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_ENDPGM
+...
+---
+# check two back-edges to the same node with different modes
+# CHECK-LABEL: name: double_loop
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK-LABEL: bb.2:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_SQRT_F64_e32
+# CHECK-LABEL: bb.4:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+
+name: double_loop
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    S_NOP 1
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.1, %bb.3
+    $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    S_NOP 1
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5
+    S_NOP 1
+    S_BRANCH %bb.5
+
+  bb.5:
+    successors: %bb.1, %bb.6
+    S_SETREG_IMM32_B32 3, 2177
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.6
+
+  bb.6:
+    S_ENDPGM
+...
+---
+# check that mode is propagated back to start of loop and through a block that
+# neither sets nor uses the mode.
+# CHECK-LABEL: name: loop_indirect
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: loop_indirect
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    S_NOP 1
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    S_NOP 1
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.1, %bb.4
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM
+...
+---
+# check that multiple mode values are propagated to a block that uses the mode
+# CHECK-LABEL: name: multiple_mode_direct
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_SQRT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: multiple_mode_direct
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.3
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+    S_SETREG_IMM32_B32 3, 2177
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM
+...
+---
+# check that multiple mode values are propagated through a block that neither
+# sets nor uses the mode.
+# CHECK-LABEL: name: multiple_mode_indirect +# CHECK-LABEL: bb.4: +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_SQRT_F64_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: multiple_mode_indirect + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + successors: %bb.1 + $m0 = S_MOV_B32 killed $sgpr2 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2, %bb.3 + S_CBRANCH_VCCZ %bb.2, implicit $vcc + S_BRANCH %bb.3 + + bb.2: + successors: %bb.3 + S_SETREG_IMM32_B32 3, 2177 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4 + S_NOP 1 + S_BRANCH %bb.4 + + bb.4: + successors: %bb.5 + $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + S_BRANCH %bb.5 + + bb.5: + S_ENDPGM +... +--- +# CHECK-LABEL: name: pass_through_blocks +# CHECK-LABEL: bb.0: +# CHECK: V_SQRT_F64_e32 +# CHECK-NEXT: S_SETREG_IMM32_B32 3, 2177 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: pass_through_blocks + +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr1_vgpr2 + $vgpr1_vgpr2 = V_SQRT_F64_e32 killed $vgpr1_vgpr2, implicit $exec + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4 + S_BRANCH %bb.4 + + bb.4: + $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec + S_ENDPGM +... +--- +# check that multiple mode values are propagated +# CHECK-LABEL: name: if_then_else +# CHECK-LABEL: bb.3: +# CHECK: S_SETREG_IMM32_B32 0, 2177 +# CHECK: V_SQRT_F64_e32 +# CHECK-NOT: S_SETREG_IMM32_B32 + +name: if_then_else + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4 + successors: %bb.1 + $m0 = S_MOV_B32 killed $sgpr2 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2, %bb.3 + S_CBRANCH_VCCZ %bb.3, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + S_SETREG_IMM32_B32 3, 2177 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4 + $vgpr3_vgpr4 = V_SQRT_F64_e32 killed $vgpr3_vgpr4, implicit $exec + S_BRANCH %bb.4 + + bb.4: + S_ENDPGM +...