Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -556,6 +556,10 @@
     return SGPRInitBug;
   }
 
+  bool has12DWordStoreHazard() const {
+    return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
+  }
+
   unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
 
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
Index: llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td
@@ -1164,6 +1164,7 @@
 defm BUFFER_ATOMIC_XOR_X2      : MUBUF_Real_Atomic_si <0x5b>;
 defm BUFFER_ATOMIC_INC_X2      : MUBUF_Real_Atomic_si <0x5c>;
 defm BUFFER_ATOMIC_DEC_X2      : MUBUF_Real_Atomic_si <0x5d>;
+// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI.
 //defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e>; // isn't on VI
 //defm BUFFER_ATOMIC_FMIN_X2     : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI
 //defm BUFFER_ATOMIC_FMAX_X2     : MUBUF_Real_Atomic_si <0x60>; // isn't on VI
Index: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -35,6 +35,7 @@
   const MachineFunction &MF;
   const SISubtarget &ST;
 
+  int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
   int getWaitStatesSinceDef(unsigned Reg,
                             function_ref<bool(MachineInstr *)> IsHazardDef =
                                 [](MachineInstr *) { return true; });
@@ -47,6 +48,8 @@
   int checkDivFMasHazards(MachineInstr *DivFMas);
   int checkGetRegHazards(MachineInstr *GetRegInstr);
   int checkSetRegHazards(MachineInstr *SetRegInstr);
+  int createsVALUHazard(const MachineInstr &MI);
+  int checkVALUHazards(MachineInstr *VALU);
 public:
   GCNHazardRecognizer(const MachineFunction &MF);
   // We can only issue one instruction per cycle.
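
To illustrate the hazard that has12DWordStoreHazard() models (a sketch inferred
from the test expectations at the end of this patch, not from hardware
documentation): on CI and VI, a VMEM store of more than 8 bytes of data can
have its store-data VGPRs overwritten by a VALU write issued immediately
afterwards, so the recognizer must insert a wait state whenever the VALU
destination overlaps the store data. In the MIR of the test below, the
expected output looks like:

    BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
    S_NOP 0
    %vgpr3 = V_MOV_B32_e32 0, implicit %exec

SI (SOUTHERN_ISLANDS) does not have the hazard, which is why the new subtarget
predicate excludes that generation.
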
Index: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -67,6 +67,9 @@
   if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
     return NoopHazard;
 
+  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
+    return NoopHazard;
+
   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
     return NoopHazard;
 
@@ -90,14 +93,20 @@
   if (SIInstrInfo::isSMRD(*MI))
     return std::max(0, checkSMRDHazards(MI));
 
-  if (SIInstrInfo::isVMEM(*MI))
-    return std::max(0, checkVMEMHazards(MI));
+  if (SIInstrInfo::isVALU(*MI)) {
+    int WaitStates = std::max(0, checkVALUHazards(MI));
 
-  if (SIInstrInfo::isDPP(*MI))
-    return std::max(0, checkDPPHazards(MI));
+    if (SIInstrInfo::isVMEM(*MI))
+      WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
 
-  if (isDivFMas(MI->getOpcode()))
-    return std::max(0, checkDivFMasHazards(MI));
+    if (SIInstrInfo::isDPP(*MI))
+      WaitStates = std::max(WaitStates, checkDPPHazards(MI));
+
+    if (isDivFMas(MI->getOpcode()))
+      WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
+
+    return WaitStates;
+  }
 
   if (isSGetReg(MI->getOpcode()))
     return std::max(0, checkGetRegHazards(MI));
@@ -149,32 +158,38 @@
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
-int GCNHazardRecognizer::getWaitStatesSinceDef(
-    unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+int GCNHazardRecognizer::getWaitStatesSince(
+    function_ref<bool(MachineInstr *)> IsHazard) {
   int WaitStates = -1;
   for (MachineInstr *MI : EmittedInstrs) {
     ++WaitStates;
-    if (!MI || !IsHazardDef(MI))
+    if (!MI || !IsHazard(MI))
       continue;
-    if (MI->modifiesRegister(Reg, TRI))
-      return WaitStates;
+    return WaitStates;
   }
   return std::numeric_limits<int>::max();
 }
 
+int GCNHazardRecognizer::getWaitStatesSinceDef(
+    unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
+    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
+  };
+
+  return getWaitStatesSince(IsHazardFn);
+}
+
 int GCNHazardRecognizer::getWaitStatesSinceSetReg(
     function_ref<bool(MachineInstr *)> IsHazard) {
-  int WaitStates = -1;
-  for (MachineInstr *MI : EmittedInstrs) {
-    ++WaitStates;
-    if (!MI || !isSSetReg(MI->getOpcode()) || !IsHazard(MI))
-      continue;
-    return WaitStates;
-  }
-  return std::numeric_limits<int>::max();
+  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
+    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
+  };
+
+  return getWaitStatesSince(IsHazardFn);
 }
 
 //===----------------------------------------------------------------------===//
@@ -350,3 +365,75 @@
   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
   return SetRegWaitStates - WaitStatesNeeded;
 }
+
+int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
+  if (!MI.mayStore())
+    return -1;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+
+  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
+  int VDataRCID = -1;
+  if (VDataIdx != -1)
+    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
+
+  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
+    // For MUBUF/MTBUF instructions this hazard only exists if the
+    // instruction is not using a register in the soffset field.
+    const MachineOperand *SOffset =
+        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+    // If we have no soffset operand, then assume this field has been
+    // hardcoded to zero.
+    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
+        (!SOffset || !SOffset->isReg()))
+      return VDataIdx;
+  }
+
+  // MIMG instructions create a hazard if they don't use a 256-bit T# and
+  // the store size is greater than 8 bytes and they have more than two bits
+  // of their dmask set.
+  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
+  if (TII->isMIMG(MI)) {
+    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
+    assert(SRsrcIdx != -1 &&
+           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
+  }
+
+  if (TII->isFLAT(MI)) {
+    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::data);
+    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
+      return DataIdx;
+  }
+
+  return -1;
+}
+
+int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
+  // This checks for the hazard where VMEM instructions that store more than
+  // 8 bytes can have their store data overwritten by the next instruction.
+  if (!ST.has12DWordStoreHazard())
+    return 0;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const MachineRegisterInfo &MRI =
+      VALU->getParent()->getParent()->getRegInfo();
+
+  const int VALUWaitStates = 1;
+  int WaitStatesNeeded = 0;
+
+  for (const MachineOperand &Def : VALU->defs()) {
+    if (!TRI->isVGPR(MRI, Def.getReg()))
+      continue;
+    unsigned Reg = Def.getReg();
+    auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
+      int DataIdx = createsVALUHazard(*MI);
+      return DataIdx >= 0 &&
+             TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
+    };
+    int WaitStatesNeededForDef =
+        VALUWaitStates - getWaitStatesSince(IsHazardFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+  }
+  return WaitStatesNeeded;
+}
Index: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -158,6 +158,9 @@
 bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);
 
 /// \brief Get the size in bits of a register from the register class ID \p RCID.
+unsigned getRegBitWidth(unsigned RCID);
+
+/// \brief Get the size in bits of a register from the register class \p RC.
 unsigned getRegBitWidth(const MCRegisterClass &RC);
 
 /// \brief Get size of register operand
Index: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -352,8 +352,8 @@
 
 // Avoid using MCRegisterClass::getSize, since that function will go away
 // (move from MC* level to Target* level). Return size in bits.
-unsigned getRegBitWidth(const MCRegisterClass &RC) {
-  switch (RC.getID()) {
+unsigned getRegBitWidth(unsigned RCID) {
+  switch (RCID) {
   case AMDGPU::SGPR_32RegClassID:
   case AMDGPU::VGPR_32RegClassID:
   case AMDGPU::VS_32RegClassID:
@@ -382,6 +382,10 @@
   }
 }
 
+unsigned getRegBitWidth(const MCRegisterClass &RC) {
+  return getRegBitWidth(RC.getID());
+}
+
 unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
                            unsigned OpNo) {
   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
Index: llvm/trunk/test/CodeGen/MIR/AMDGPU/inserted-wait-states.mir
===================================================================
--- llvm/trunk/test/CodeGen/MIR/AMDGPU/inserted-wait-states.mir
+++ llvm/trunk/test/CodeGen/MIR/AMDGPU/inserted-wait-states.mir
@@ -1,11 +1,12 @@
 # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,VI
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI
 
 --- |
   define void @div_fmas() { ret void }
   define void @s_getreg() { ret void }
   define void @s_setreg() { ret void }
+  define void @vmem_gt_8dw_store() { ret void }
 ...
 ---
 # GCN-LABEL: name: div_fmas
@@ -159,3 +160,77 @@
   S_SETREG_B32 %sgpr1, 0
   S_ENDPGM
 ...
+
+...
+---
+# GCN-LABEL: name: vmem_gt_8dw_store
+
+# GCN-LABEL: bb.0:
+# GCN: BUFFER_STORE_DWORD_OFFSET
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX3_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX4_OFFSET
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX4_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_FORMAT_XYZ_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+
+# GCN-LABEL: bb.1:
+# GCN: FLAT_STORE_DWORDX2
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_STORE_DWORDX3
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_STORE_DWORDX4
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_ATOMIC_CMPSWAP_X2
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_ATOMIC_FCMPSWAP_X2
+# CIVI: S_NOP
+# GCN: V_MOV_B32
+
+name: vmem_gt_8dw_store
+
+body: |
+  bb.0:
+    successors: %bb.1
+    BUFFER_STORE_DWORD_OFFSET %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_DWORDX3_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_FORMAT_XYZ_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_FORMAT_XYZW_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_ATOMIC_CMPSWAP_X2_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    S_ENDPGM
+
+...
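
As a usage note, the new behavior can be observed with any of the RUN lines
above, e.g.:

  llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec inserted-wait-states.mir -o -

On hawaii (CI) and fiji (VI) an S_NOP appears between each store wider than
two dwords and the following V_MOV_B32 that overwrites part of the store data;
on tahiti (SI) no nops are inserted, matching has12DWordStoreHazard().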