Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -58,6 +58,7 @@
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+FunctionPass *createSIModeRegisterPass();
 
 void initializeAMDGPUDAGToDAGISelPass(PassRegistry&);
 
@@ -191,6 +192,9 @@
 void initializeSIDebuggerInsertNopsPass(PassRegistry&);
 extern char &SIDebuggerInsertNopsID;
 
+void initializeSIModeRegisterPass(PassRegistry&);
+extern char &SIModeRegisterID;
+
 void initializeSIInsertWaitcntsPass(PassRegistry&);
 extern char &SIInsertWaitcntsID;
 
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -145,6 +145,13 @@
   cl::init(false), cl::Hidden);
 
+// Enable Mode register optimization
+static cl::opt<bool> EnableSIModeRegisterPass(
+  "amdgpu-mode-register",
+  cl::desc("Enable mode register pass"),
+  cl::init(true),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -183,6 +190,7 @@
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
+  initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
   initializeSIInsertSkipsPass(*PR);
@@ -892,6 +900,7 @@
   addPass(createSIMemoryLegalizerPass());
   addPass(createSIInsertWaitcntsPass());
   addPass(createSIShrinkInstructionsPass());
+  addPass(createSIModeRegisterPass());
 
   // The hazard recognizer that runs as part of the post-ra scheduler does not
   // guarantee to be able handle all hazards correctly. This is because if there
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -119,6 +119,7 @@
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
+  SIModeRegister.cpp
   )
 
 add_subdirectory(AsmParser)
Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -88,7 +88,10 @@
   IsPacked = UINT64_C(1) << 49,
 
   // Is a D16 buffer instruction.
-  D16Buf = UINT64_C(1) << 50
+  D16Buf = UINT64_C(1) << 50,
+
+  // Uses floating point double precision rounding mode
+  FPDPRounding = UINT64_C(1) << 51
 };
 
 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -121,6 +121,10 @@
   // This bit indicates that this is a D16 buffer instruction.
   field bit D16Buf = 0;
 
+  // This bit indicates that this instruction uses the floating point double
+  // precision rounding mode flags.
+  field bit FPDPRounding = 0;
+
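+  // Note: this field is surfaced to C++ code as TSFlags bit 51, matching the
+  // FPDPRounding enum value added in SIDefines.h above; the new
+  // SIInstrInfo::usesFPDPRounding() helpers added below simply test that bit.
+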
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = SALU;
   let TSFlags{1} = VALU;
 
@@ -178,6 +182,8 @@
 
   let TSFlags{50} = D16Buf;
 
+  let TSFlags{51} = FPDPRounding;
+
   let SchedRW = [Write32Bit];
 
   field bits<1> DisableSIDecoder = 0;
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -603,6 +603,14 @@
     return MI.getDesc().TSFlags & ClampFlags;
   }
 
+  static bool usesFPDPRounding(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
+  bool usesFPDPRounding(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
+  }
+
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(MI.isCopy());
     unsigned Dest = MI.getOperand(0).getReg();
Index: lib/Target/AMDGPU/SIModeRegister.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIModeRegister.cpp
@@ -0,0 +1,406 @@
+//===-- SIModeRegister.cpp - Mode Register --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass inserts changes to the Mode register settings as required.
+/// Note that currently it only deals with the Double Precision Floating Point
+/// rounding mode setting, but is intended to be generic enough to be easily
+/// expanded.
+///
+//===----------------------------------------------------------------------===//
+//
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <queue>
+
+#define DEBUG_TYPE "si-mode-register"
+
+STATISTIC(NumSetregInserted,
+          "Number of mode register setreg instructions inserted.");
+
+using namespace llvm;
+
+struct Status {
+  // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a
+  // known value.
+  unsigned Mask;
+  unsigned Mode;
+
+  Status() : Mask(0), Mode(0) {}
+
+  Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) {
+    this->Mode &= Mask;
+  }
+
+  // Merge two status values such that the incoming value (S) takes priority
+  // for any bits known in both.
+  Status merge(const Status &S) const {
+    return Status((Mask | S.Mask), ((Mode & ~S.Mask) | (S.Mode & S.Mask)));
+  }
+
+  // Merge an unknown value by using the unknown value's mask to remove bits
+  // from the result.
+  Status mergeUnknown(unsigned newMask) {
+    return Status(Mask & ~newMask, Mode & ~newMask);
+  }
+
+  // Intersect two Status values to produce a mode and mask that is a subset
+  // of both values.
+  Status intersect(const Status &S) const {
+    unsigned NewMask = (Mask & S.Mask) & (Mode ^ ~S.Mode);
+    unsigned NewMode = (Mode & NewMask);
+    return Status(NewMask, NewMode);
+  }
+
+  // Produce the delta required to change the Mode to the required Mode.
+  Status delta(const Status &S) const {
+    return Status((S.Mask & (Mode ^ S.Mode)) | (~Mask & S.Mask), S.Mode);
+  }
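+
+  // A worked example of these operations on the DP rounding field (mask 0xC,
+  // see the FP_ROUND_MODE_DP uses below). With A = Status(0xC, 0x0) (DP
+  // rounding known to be round-to-nearest) and B = Status(0xC, 0xC) (known
+  // to be round-to-zero):
+  //   A.merge(B)     == Status(0xC, 0xC)  // B's known bits take priority
+  //   A.intersect(B) == Status(0x0, 0x0)  // the values disagree, nothing known
+  //   A.delta(B)     == Status(0xC, 0xC)  // bits 3:2 must be set to reach B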
+
+  bool operator==(const Status &S) const {
+    return (Mask == S.Mask) && (Mode == S.Mode);
+  }
+
+  bool operator!=(const Status &S) const { return !(*this == S); }
+
+  bool isCompatible(Status &S) {
+    return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode);
+  }
+
+  bool isCombinable(Status &S) {
+    return !(Mask & S.Mask) || isCompatible(S);
+  }
+};
+
+class BlockData {
+public:
+  // The Status that represents the mode register settings required by the
+  // FirstInsertionPoint (if any) in this block. Calculated in Phase 1.
+  Status Require;
+
+  // The Status that represents the net changes to the Mode register made by
+  // this block. Calculated in Phase 1.
+  Status Change;
+
+  // The Status that represents the mode register settings on exit from this
+  // block. Calculated in Phase 2.
+  Status Exit;
+
+  // The Status that represents the intersection of exit Mode register settings
+  // from all predecessor blocks. Calculated in Phase 2, and used by Phase 3.
+  Status Pred;
+
+  // In Phase 1 we record the first instruction that has a mode requirement,
+  // which is used in Phase 3 if we need to insert a mode change.
+  MachineInstr *FirstInsertionPoint;
+
+  BlockData() : FirstInsertionPoint(nullptr) {}
+};
+
+namespace {
+
+class SIModeRegister : public MachineFunctionPass {
+public:
+  static char ID;
+
+  std::vector<std::unique_ptr<BlockData>> BlockInfo;
+  std::queue<MachineBasicBlock *> Phase2List;
+
+  // The default mode register setting currently only caters for the floating
+  // point double precision rounding mode.
+  // We currently assume the default rounding mode is Round to Nearest.
+  // NOTE: this should come from a per function rounding mode setting once such
+  // a setting exists.
+  unsigned DefaultMode = FP_ROUND_ROUND_TO_NEAREST;
+  Status DefaultStatus =
+      Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode));
+
+  SIModeRegister() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  void processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+  Status getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII);
+
+  void insertSetreg(MachineBasicBlock &MBB, MachineInstr *I,
+                    const SIInstrInfo *TII, Status InstrMode);
+};
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE,
+                "Insert required mode register values", false, false)
+
+char SIModeRegister::ID = 0;
+
+char &llvm::SIModeRegisterID = SIModeRegister::ID;
+
+FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); }
+
+// Determine the Mode register setting required for this instruction.
+// Instructions which don't use the Mode register return a null Status.
+// Note this currently only deals with instructions that use the floating point
+// double precision setting.
+Status SIModeRegister::getInstructionMode(MachineInstr &MI,
+                                          const SIInstrInfo *TII) {
+  if (TII->usesFPDPRounding(MI)) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_INTERP_P1LL_F16:
+    case AMDGPU::V_INTERP_P1LV_F16:
+    case AMDGPU::V_INTERP_P2_F16:
+      // f16 interpolation instructions need double precision round to zero.
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+    default:
+      return DefaultStatus;
+    }
+  }
+  return Status();
+}
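+
+// For example, for the f16 interpolation opcodes above, FP_ROUND_MODE_DP(3)
+// yields the two-bit mask covering the DP rounding field (MODE[3:2], i.e.
+// 0xC), and FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO) places the round-to-zero
+// value (3) in that field, so the returned Status is Status(0xC, 0xC).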
+
+// Insert a setreg instruction to update the Mode register.
+// It is possible (though unlikely) for an instruction to require a change to
+// the value of disjoint parts of the Mode register when we don't know the
+// value of the intervening bits. In that case we need to use more than one
+// setreg instruction.
+void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
+                                  const SIInstrInfo *TII, Status InstrMode) {
+  while (InstrMode.Mask) {
+    unsigned Offset = countTrailingZeros(InstrMode.Mask);
+    unsigned Width = countTrailingOnes(InstrMode.Mask >> Offset);
+    unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
+    BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+        .addImm(Value)
+        .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
+                (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+                (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
+    ++NumSetregInserted;
+    InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
+  }
+}
+
+// In Phase 1 we iterate through the instructions of the block and for each
+// instruction we get its mode usage. If the instruction uses the Mode register
+// we:
+// - update the Change status, which tracks the changes to the Mode register
+//   made by this block
+// - if this instruction's requirements are compatible with the current setting
+//   of the Mode register we merge the modes
+// - if it isn't compatible and an InsertionPoint isn't set, then we set the
+//   InsertionPoint to the current instruction, and we remember the current
+//   mode
+// - if it isn't compatible and an InsertionPoint is set we insert a setreg
+//   before that instruction (unless this instruction forms part of the block's
+//   entry requirements, in which case the insertion is deferred until Phase 3
+//   when predecessor exit values are known), and move the insertion point to
+//   this instruction
+// - if this is a setreg instruction we treat it as an incompatible instruction.
+//   This is sub-optimal but avoids some nasty corner cases, and is expected to
+//   occur very rarely.
+// - on exit we have set the Require, Change, and initial Exit modes.
+void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  auto NewInfo = llvm::make_unique<BlockData>();
+  MachineInstr *InsertionPoint = nullptr;
+  // RequirePending is used to indicate whether we are collecting the initial
+  // requirements for the block, and need to defer the first InsertionPoint to
+  // Phase 3. It is set to false once we have set FirstInsertionPoint, or when
+  // we discover an explicit setreg that means this block doesn't have any
+  // initial requirements.
+  bool RequirePending = true;
+  Status IPChange;
+  for (MachineInstr &MI : MBB) {
+    Status InstrMode = getInstructionMode(MI, TII);
+    if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) ||
+        (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) {
+      // We preserve any explicit mode register setreg instruction we
+      // encounter, as we assume it has been inserted by a higher authority
+      // (this is likely to be a very rare occurrence).
+      unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
+      if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
+          AMDGPU::Hwreg::ID_MODE)
+        continue;
+
+      unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
+                        AMDGPU::Hwreg::WIDTH_M1_SHIFT_) +
+                       1;
+      unsigned Offset =
+          (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_;
+      unsigned Mask = ((1 << Width) - 1) << Offset;
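+
+      // For example, an explicit 'S_SETREG_IMM32_B32 3, 2177' (as used in the
+      // mode-register.mir tests) decodes here to Width = 2, Offset = 2 and
+      // Mask = 0xC - the double precision rounding field.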
+
+      // If an InsertionPoint is set we will insert a setreg there.
+      if (InsertionPoint) {
+        insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+        InsertionPoint = nullptr;
+      }
+      // If this is an immediate then we know the value being set, but if it is
+      // not an immediate then we treat the modified bits of the mode register
+      // as unknown.
+      if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) {
+        unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm();
+        unsigned Mode = (Val << Offset) & Mask;
+        Status Setreg = Status(Mask, Mode);
+        // If we haven't already set the initial requirements for the block we
+        // don't need to as the requirements start from this explicit setreg.
+        RequirePending = false;
+        NewInfo->Change = NewInfo->Change.merge(Setreg);
+      } else {
+        NewInfo->Change = NewInfo->Change.mergeUnknown(Mask);
+      }
+    } else if (!NewInfo->Change.isCompatible(InstrMode)) {
+      // This instruction uses the Mode register and its requirements aren't
+      // compatible with the current mode.
+      if (InsertionPoint) {
+        // If the required mode change cannot be included in the current
+        // InsertionPoint changes, we need a setreg and start a new
+        // InsertionPoint.
+        if (!IPChange.delta(NewInfo->Change).isCombinable(InstrMode)) {
+          if (RequirePending) {
+            // This is the first InsertionPoint in the block so we will defer
+            // the insertion of the setreg to Phase 3, where we know whether or
+            // not it is actually needed.
+            NewInfo->FirstInsertionPoint = InsertionPoint;
+            NewInfo->Require = NewInfo->Change;
+            RequirePending = false;
+          } else {
+            insertSetreg(MBB, InsertionPoint, TII,
                         IPChange.delta(NewInfo->Change));
+            IPChange = NewInfo->Change;
+          }
+          // Set the new InsertionPoint.
+          InsertionPoint = &MI;
+        }
+        NewInfo->Change = NewInfo->Change.merge(InstrMode);
+      } else {
+        // No InsertionPoint is currently set - this is either the first in
+        // the block or we have previously seen an explicit setreg.
+        InsertionPoint = &MI;
+        IPChange = NewInfo->Change;
+        NewInfo->Change = NewInfo->Change.merge(InstrMode);
+      }
+    }
+  }
+  if (RequirePending) {
+    // If we haven't yet set the initial requirements for the block we set them
+    // now.
+    NewInfo->FirstInsertionPoint = InsertionPoint;
+    NewInfo->Require = NewInfo->Change;
+  } else if (InsertionPoint) {
+    // We need to insert a setreg at the InsertionPoint.
+    insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+  }
+  NewInfo->Exit = NewInfo->Change;
+  BlockInfo[MBB.getNumber()] = std::move(NewInfo);
+}
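+
+// A worked example of the inter-block flow, using the 'loop' case from
+// test/CodeGen/AMDGPU/mode-register.mir: bb.1 contains V_FRACT_F64 (requires
+// RTN) and its predecessors are bb.0 (Exit = default RTN) and bb.2, which
+// leaves RTZ set for its f16 interp. Phase 2 computes
+// Pred(bb.1) = Exit(bb.0).intersect(Exit(bb.2)), in which the DP rounding
+// bits are unknown, so Phase 3 must insert 'S_SETREG_IMM32_B32 0, 2177'
+// before the V_FRACT_F64 in bb.1 - exactly what that test checks for.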
+
+// In Phase 2 we revisit each block and calculate the common Mode register
+// value provided by all predecessor blocks. If the Exit value for the block
+// is changed, then we add the successor blocks to the worklist so that the
+// exit value is propagated.
+void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  unsigned ThisBlock = MBB.getNumber();
+  if (MBB.pred_empty()) {
+    // There are no predecessors, so use the default starting status.
+    BlockInfo[ThisBlock]->Pred = DefaultStatus;
+  } else {
+    // Build a status that is common to all the predecessors by intersecting
+    // all the predecessor exit status values.
+    MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end();
+    MachineBasicBlock &PB = *(*P);
+    BlockInfo[ThisBlock]->Pred = BlockInfo[PB.getNumber()]->Exit;
+
+    for (P = std::next(P); P != E; P = std::next(P)) {
+      MachineBasicBlock *Pred = *P;
+      BlockInfo[ThisBlock]->Pred =
+          BlockInfo[ThisBlock]->Pred.intersect(
              BlockInfo[Pred->getNumber()]->Exit);
+    }
+  }
+  Status TmpStatus =
+      BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change);
+  if (BlockInfo[ThisBlock]->Exit != TmpStatus) {
+    BlockInfo[ThisBlock]->Exit = TmpStatus;
+    // Add the successors to the work list so we can propagate the changed exit
+    // status.
+    for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
+                                          E = MBB.succ_end();
+         S != E; S = std::next(S)) {
+      MachineBasicBlock &B = *(*S);
+      Phase2List.push(&B);
+    }
+  }
+}
+
+// In Phase 3 we revisit each block and if it has an insertion point defined we
+// check whether the predecessor mode meets the block's entry requirements. If
+// not we insert an appropriate setreg instruction to modify the Mode register.
+void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  unsigned ThisBlock = MBB.getNumber();
+  if (!BlockInfo[ThisBlock]->Pred.isCompatible(BlockInfo[ThisBlock]->Require)) {
+    Status Delta =
+        BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require);
+    if (BlockInfo[ThisBlock]->FirstInsertionPoint)
+      insertSetreg(MBB, BlockInfo[ThisBlock]->FirstInsertionPoint, TII, Delta);
+    else
+      insertSetreg(MBB, &MBB.instr_front(), TII, Delta);
+  }
+}
+
+bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
+  BlockInfo.resize(MF.getNumBlockIDs());
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Processing is performed in a number of phases:
+
+  // Phase 1 - determine the initial mode required by each block, and add setreg
+  // instructions for intra block requirements.
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase1(BB, TII);
+
+  // Phase 2 - determine the exit mode from each block. We add all blocks to the
+  // list here, but will also add any that need to be revisited during Phase 2
+  // processing.
+  for (MachineBasicBlock &BB : MF)
+    Phase2List.push(&BB);
+  while (!Phase2List.empty()) {
+    processBlockPhase2(*Phase2List.front(), TII);
+    Phase2List.pop();
+  }
+
+  // Phase 3 - add an initial setreg to each block where the required entry mode
+  // is not satisfied by the exit mode of all its predecessors.
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase3(BB, TII);
+
+  BlockInfo.clear();
+
+  return NumSetregInserted > 0;
+}
Index: lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP1Instructions.td
+++ lib/Target/AMDGPU/VOP1Instructions.td
@@ -173,7 +173,9 @@
 defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
 defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
 defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
+let FPDPRounding = 1 in {
 defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+} // End FPDPRounding = 1
 defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
 defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
 defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
@@ -226,7 +228,9 @@
 let SchedRW = [WriteDoubleAdd] in {
 defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
 defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
+let FPDPRounding = 1 in {
 defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
+} // End FPDPRounding = 1
 } // End SchedRW = [WriteDoubleAdd]
 
 defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
@@ -333,8 +337,10 @@
 
 let SubtargetPredicate = Has16BitInsts in {
 
+let FPDPRounding = 1 in {
 defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
 defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
+} // End FPDPRounding = 1
 defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
 defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
 let SchedRW = [WriteQuarterRate32] in {
@@ -352,7 +358,9 @@
 defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
 defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
 defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
+let FPDPRounding = 1 in {
 defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
+} // End FPDPRounding = 1
 }
 
Index: lib/Target/AMDGPU/VOP2Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP2Instructions.td
+++ lib/Target/AMDGPU/VOP2Instructions.td
@@ -540,18 +540,23 @@
 
 let SubtargetPredicate = Has16BitInsts in {
 
+let FPDPRounding = 1 in {
 def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
+defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+} // End FPDPRounding = 1
+
 defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
 defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
 defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
-defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
 
 let isCommutable = 1 in {
+let FPDPRounding = 1 in {
 defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
 defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
 defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
 defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
 def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
+} // End FPDPRounding = 1
 defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
 defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
 defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
<"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; Index: lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP3Instructions.td +++ lib/Target/AMDGPU/VOP3Instructions.td @@ -220,7 +220,8 @@ // VOP3 INTERP //===----------------------------------------------------------------------===// -class VOP3Interp : VOP3_Pseudo { +class VOP3Interp pattern = []> : + VOP3_Pseudo { let AsmMatchConverter = "cvtVOP3Interp"; } @@ -292,9 +293,11 @@ def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile, int_amdgcn_lerp>; let SchedRW = [WriteDoubleAdd] in { +let FPDPRounding = 1 in { def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile, fma>; def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile, fadd, 1>; def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile, fmul, 1>; +} // End FPDPRounding = 1 def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile, fminnum_like, 1>; def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum_like, 1>; } // End SchedRW = [WriteDoubleAdd] @@ -324,6 +327,7 @@ def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, getVOP3VCC.ret> { let SchedRW = [WriteDouble]; + let FPDPRounding = 1; } } // End Uses = [VCC, EXEC] @@ -354,10 +358,10 @@ def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile, int_amdgcn_cvt_pk_u8_f32>; def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile, AMDGPUdiv_fixup>; -let SchedRW = [WriteDoubleAdd] in { +let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in { def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile, AMDGPUdiv_fixup>; def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile, AMDGPUldexp, 1>; -} // End SchedRW = [WriteDoubleAdd] +} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> { let SchedRW = [WriteFloatFMA, WriteSALU]; @@ -368,6 +372,7 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> { let SchedRW = [WriteDouble, WriteSALU]; let AsmMatchConverter = ""; + let FPDPRounding = 1; } def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile>; @@ -431,39 +436,51 @@ def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile, AMDGPUdiv_fixup> { let Predicates = [Has16BitInsts, isVIOnly]; + let FPDPRounding = 1; } def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile, AMDGPUdiv_fixup> { let renamedInGFX9 = 1; let Predicates = [Has16BitInsts, isGFX9]; + let FPDPRounding = 1; } def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile, fma> { let Predicates = [Has16BitInsts, isVIOnly]; + let FPDPRounding = 1; } def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile, fma> { let renamedInGFX9 = 1; let Predicates = [Has16BitInsts, isGFX9]; + let FPDPRounding = 1; } let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in { let renamedInGFX9 = 1 in { -def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile, fmad>; def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; +let FPDPRounding = 1 in { +def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile, fmad>; +let Uses = [M0, EXEC] in { def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>; -} +} // End Uses = [M0, EXEC] +} // End FPDPRounding = 1 +} // End renamedInGFX9 = 1 let SubtargetPredicate = isGFX9 in { -def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile>; +def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile> { + let 
+  let FPDPRounding = 1;
+}
 def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
 def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
 def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
 } // End SubtargetPredicate = isGFX9
 
+let Uses = [M0, EXEC], FPDPRounding = 1 in {
 def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
 def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+} // End Uses = [M0, EXEC], FPDPRounding = 1
 
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
 
@@ -798,12 +815,15 @@
 defm V_DIV_FIXUP_F16 : VOP3_F16_Real_vi <0x1ef>;
 defm V_INTERP_P2_F16 : VOP3Interp_F16_Real_vi <0x276>;
 
+let FPDPRounding = 1 in {
 defm V_MAD_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ea, "V_MAD_F16", "v_mad_legacy_f16">;
-defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
-defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
 defm V_FMA_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ee, "V_FMA_F16", "v_fma_legacy_f16">;
 defm V_DIV_FIXUP_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ef, "V_DIV_FIXUP_F16", "v_div_fixup_legacy_f16">;
 defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16", "v_interp_p2_legacy_f16">;
+} // End FPDPRounding = 1
+
+defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
+defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
 
 defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
 defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
Index: lib/Target/AMDGPU/VOP3PInstructions.td
===================================================================
--- lib/Target/AMDGPU/VOP3PInstructions.td
+++ lib/Target/AMDGPU/VOP3PInstructions.td
@@ -42,12 +42,14 @@
 }
 
 let isCommutable = 1 in {
-def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
 def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
 def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
 
+let FPDPRounding = 1 in {
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
 def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
 def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
+} // End FPDPRounding = 1
 
 def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
 def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
@@ -137,12 +139,14 @@
 
 let isCommutable = 1 in {
 def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+let FPDPRounding = 1 in {
 // Clamp modifier is applied after conversion to f16.
 def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 let ClampLo = 0, ClampHi = 1 in {
 def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 }
+} // End FPDPRounding = 1
 }
 
 defm : MadFmaMixPats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
@@ -154,12 +158,14 @@
 
 let isCommutable = 1 in {
 def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+let FPDPRounding = 1 in {
 // Clamp modifier is applied after conversion to f16.
 def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 let ClampLo = 0, ClampHi = 1 in {
 def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
 }
+} // End FPDPRounding = 1
 }
 
 defm : MadFmaMixPats<fma, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
Index: test/CodeGen/AMDGPU/mode-register.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/mode-register.mir
@@ -0,0 +1,459 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-mode-register %s -o - | FileCheck %s
+
+---
+# check that the mode is changed to rtz from the default rtn for interp f16
+# CHECK-LABEL: name: interp_f16_default
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: V_INTERP_P1LL_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_ADD_F16_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: interp_f16_default
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is not changed for interp f16 when the mode is already RTZ
+# CHECK-LABEL: name: interp_f16_explicit_rtz
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: V_MOV_B32_e32
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_ADD_F16_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: interp_f16_explicit_rtz
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_SETREG_IMM32_B32 3, 2177
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
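+---
+# A note on the setreg immediates used throughout this file: the simm16
+# operand packs hwreg(ID, OFFSET, WIDTH - 1) with ID in bits [5:0], OFFSET in
+# bits [10:6] and WIDTH - 1 in bits [15:11]. So 2177 (0x881) decodes to
+# hwreg(HW_REG_MODE, 2, 2), i.e. the double precision rounding field
+# MODE[3:2], where the value 0 selects round-to-nearest and 3 selects
+# round-to-zero.
+...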
+---
+# check that an explicit RTN mode change is recognized by the pass
+# CHECK-LABEL: name: explicit_rtn
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: V_INTERP_P1LL_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_ADD_F16_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: explicit_rtn
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    S_SETREG_IMM32_B32 0, 2177
+    $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is unchanged from RTN for the F64 instruction
+# CHECK-LABEL: name: rtn_default
+# CHECK-LABEL: bb.0:
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK: V_FRACT_F64
+
+name: rtn_default
+
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is changed from RTZ to RTN for the F64 instruction
+# CHECK-LABEL: name: rtn_from_rtz
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_FRACT_F64
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: rtn_from_rtz
+
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2
+    S_SETREG_IMM32_B32 3, 2177
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is changed from RTN to RTZ for the interp instruction
+# in the second block
+# CHECK-LABEL: name: rtz_from_rtn
+# CHECK-LABEL: bb.1:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: rtz_from_rtn
+
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr1_vgpr2
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is changed from RTZ to RTN for the F64 instruction
+# and back again for the remaining interp instruction
+# CHECK-LABEL: name: interp_f16_plus_sqrt_f64
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P2_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P2_F16
+
+name: interp_f16_plus_sqrt_f64
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
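+---
+# The next test uses 'S_SETREG_IMM32_B32 2, 2049': 2049 (0x801) decodes to
+# hwreg(HW_REG_MODE, 0, 2), the single precision rounding field MODE[1:0],
+# which is disjoint from the DP field at MODE[3:2], so the pass must not
+# treat it as a DP rounding change.
+...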
+---
+# check that an explicit change to the single precision rounding mode does
+# not affect the double precision mode handling
+# CHECK-LABEL: name: single_precision_mode_change
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P2_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P2_F16
+
+name: single_precision_mode_change
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    $m0 = S_MOV_B32 killed $sgpr2
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_SETREG_IMM32_B32 2, 2049
+    $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+    $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+    $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+    $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that the mode is propagated back to the start of the loop: the first
+# instruction requires RTN, but needs a setreg because RTZ is set later in
+# the loop
+# CHECK-LABEL: name: loop
+# CHECK-LABEL: bb.1:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64
+# CHECK-LABEL: bb.2:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: loop
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.1, %bb.3
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_ENDPGM
+...
+---
+# two back-edges to the same node with different modes
+# CHECK-LABEL: name: double_loop
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK-LABEL: bb.2:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-LABEL: bb.4:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+
+name: double_loop
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    S_NOP 1
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.1, %bb.3
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    S_NOP 1
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5
+    S_NOP 1
+    S_BRANCH %bb.5
+
+  bb.5:
+    successors: %bb.1, %bb.6
+    S_SETREG_IMM32_B32 3, 2177
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.6
+
+  bb.6:
+    S_ENDPGM
+...
+---
+# check that the mode is propagated back to the start of the loop and through
+# a block that neither sets nor uses the mode
+# CHECK-LABEL: name: loop_indirect
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: loop_indirect
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    S_NOP 1
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    S_NOP 1
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.1, %bb.4
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM
+...
+---
+# check that multiple mode values are propagated to a block that uses the mode
+# CHECK-LABEL: name: multiple_mode_direct
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: multiple_mode_direct
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.3
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+    S_SETREG_IMM32_B32 3, 2177
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM
+...
+---
+# check that multiple mode values are propagated through a block that neither
+# sets nor uses the mode
+# CHECK-LABEL: name: multiple_mode_indirect
+# CHECK-LABEL: bb.4:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: multiple_mode_indirect
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.3
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+    S_SETREG_IMM32_B32 3, 2177
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    S_NOP 1
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM
+...
+---
+# check that a mode change in the entry block is propagated through several
+# empty blocks to a later use, so no further setreg is needed
+# CHECK-LABEL: name: pass_through_blocks
+# CHECK-LABEL: bb.0:
+# CHECK: V_FRACT_F64_e32
+# CHECK-NEXT: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: pass_through_blocks
+
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr1_vgpr2
+    $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    S_BRANCH %bb.4
+
+  bb.4:
+    $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+    S_ENDPGM
+...
+---
+# check that conflicting modes from the two arms of an if-then-else force a
+# setreg at the join block
+# CHECK-LABEL: name: if_then_else
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: if_then_else
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+    successors: %bb.1
+    $m0 = S_MOV_B32 killed $sgpr2
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.3
+    S_CBRANCH_VCCZ %bb.3, implicit $vcc
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    S_SETREG_IMM32_B32 3, 2177
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4
+    $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM
+...