Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -549,6 +549,10 @@
     return SGPRInitBug;
   }
 
+  bool has12DWordStoreHazard() const {
+    return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
+  }
+
   unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
 
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -1164,6 +1164,7 @@
 defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_si <0x5b>;
 defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>;
 defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>;
+// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI.
 //defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e>; // isn't on VI
 //defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI
 //defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI
Index: lib/Target/AMDGPU/GCNHazardRecognizer.h
===================================================================
--- lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -35,6 +35,7 @@
   const MachineFunction &MF;
   const SISubtarget &ST;
 
+  int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
   int getWaitStatesSinceDef(unsigned Reg,
                             function_ref<bool(MachineInstr *)> IsHazardDef =
                                 [](MachineInstr *) { return true; });
@@ -47,6 +48,8 @@
   int checkDivFMasHazards(MachineInstr *DivFMas);
   int checkGetRegHazards(MachineInstr *GetRegInstr);
   int checkSetRegHazards(MachineInstr *SetRegInstr);
+  int createsVALUHazard(const MachineInstr &MI);
+  int checkVALUHazards(MachineInstr *VALU);
 
 public:
   GCNHazardRecognizer(const MachineFunction &MF);
   // We can only issue one instruction per cycle.
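Note (illustration, not part of the patch): the hazard being guarded against, sketched
at the ISA level. On CI/VI, a VMEM instruction that stores more than 8 bytes can have
its store data overwritten by the next instruction, so one wait state must separate
the store from any VALU write to an overlapping VGPR. Mnemonics mirror the MIR test at
the end of this patch; the operand syntax is illustrative only.

    buffer_store_dwordx3 v[2:4], s[0:3], 0  ; 12-byte store, soffset is an immediate
    s_nop 0                                 ; wait state the recognizer inserts on CI/VI
    v_mov_b32 v3, 0                         ; VALU def overlapping the store data

On SI, has12DWordStoreHazard() returns false and no nop is emitted, which is what the
tahiti RUN line in the test relies on.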
Index: lib/Target/AMDGPU/GCNHazardRecognizer.cpp
===================================================================
--- lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -67,6 +67,9 @@
   if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
     return NoopHazard;
 
+  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
+    return NoopHazard;
+
   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
     return NoopHazard;
 
@@ -90,14 +93,20 @@
   if (SIInstrInfo::isSMRD(*MI))
     return std::max(0, checkSMRDHazards(MI));
 
-  if (SIInstrInfo::isVMEM(*MI))
-    return std::max(0, checkVMEMHazards(MI));
+  if (SIInstrInfo::isVALU(*MI)) {
+    int WaitStates = std::max(0, checkVALUHazards(MI));
 
-  if (SIInstrInfo::isDPP(*MI))
-    return std::max(0, checkDPPHazards(MI));
+    if (SIInstrInfo::isVMEM(*MI))
+      WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
 
-  if (isDivFMas(MI->getOpcode()))
-    return std::max(0, checkDivFMasHazards(MI));
+    if (SIInstrInfo::isDPP(*MI))
+      WaitStates = std::max(WaitStates, checkDPPHazards(MI));
+
+    if (isDivFMas(MI->getOpcode()))
+      WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
+
+    return WaitStates;
+  }
 
   if (isSGetReg(MI->getOpcode()))
     return std::max(0, checkGetRegHazards(MI));
@@ -149,32 +158,38 @@
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
-int GCNHazardRecognizer::getWaitStatesSinceDef(
-    unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+int GCNHazardRecognizer::getWaitStatesSince(
+    function_ref<bool(MachineInstr *)> IsHazard) {
   int WaitStates = -1;
   for (MachineInstr *MI : EmittedInstrs) {
     ++WaitStates;
-    if (!MI || !IsHazardDef(MI))
+    if (!MI || !IsHazard(MI))
       continue;
-    if (MI->modifiesRegister(Reg, TRI))
-      return WaitStates;
+    return WaitStates;
   }
   return std::numeric_limits<int>::max();
 }
 
+int GCNHazardRecognizer::getWaitStatesSinceDef(
+    unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
+    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
+  };
+
+  return getWaitStatesSince(IsHazardFn);
+}
+
 int GCNHazardRecognizer::getWaitStatesSinceSetReg(
     function_ref<bool(MachineInstr *)> IsHazard) {
-  int WaitStates = -1;
-  for (MachineInstr *MI : EmittedInstrs) {
-    ++WaitStates;
-    if (!MI || !isSSetReg(MI->getOpcode()) || !IsHazard(MI))
-      continue;
-    return WaitStates;
-  }
-  return std::numeric_limits<int>::max();
+  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
+    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
+  };
+
+  return getWaitStatesSince(IsHazardFn);
 }
 
 //===----------------------------------------------------------------------===//
@@ -350,3 +365,75 @@
   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
   return SetRegWaitStates - WaitStatesNeeded;
 }
+
+int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = TII->get(Opcode);
+
+  if (!MI.mayStore())
+    return -1;
+
+  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
+  const TargetRegisterClass *VDataRC = nullptr;
+  if (VDataIdx != -1)
+    VDataRC = TRI->getRegClass(Desc.OpInfo[VDataIdx].RegClass);
+
+  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
+    // For MUBUF/MTBUF instructions this hazard only exists if the
+    // instruction is not using a register in the soffset field.
+    const MachineOperand *SOffset =
+        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+    // If we have no soffset operand, then assume this field has been
+    // hardcoded to zero.
+    if (VDataRC->getSize() > 8 && (!SOffset || !SOffset->isReg()))
+      return VDataIdx;
+  }
+
+  // MIMG instructions create a hazard if they don't use a 256-bit T# and
+  // the store size is greater than 8 bytes and they have more than two bits
+  // of their dmask set.
+  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
+  if (TII->isMIMG(MI)) {
+    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
+    assert(SRsrcIdx != -1 &&
+           TRI->getRegClass(Desc.OpInfo[SRsrcIdx].RegClass)->getSize() == 32);
+  }
+
+  if (TII->isFLAT(MI)) {
+    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::data);
+    if (TRI->getRegClass(Desc.OpInfo[DataIdx].RegClass)->getSize() > 8)
+      return DataIdx;
+  }
+
+  return -1;
+}
+
+int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const MachineRegisterInfo &MRI = VALU->getParent()->getParent()->getRegInfo();
+
+  // This checks for the hazard where VMEM instructions that store more than
+  // 8 bytes can have their store data overwritten by the next instruction.
+  if (!ST.has12DWordStoreHazard())
+    return 0;
+
+  const int VALUWaitStates = 1;
+  int WaitStatesNeeded = 0;
+
+  for (const MachineOperand &Def : VALU->defs()) {
+    if (!TRI->isVGPR(MRI, Def.getReg()))
+      continue;
+    unsigned Reg = Def.getReg();
+    auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
+      int DataIdx = createsVALUHazard(*MI);
+      return DataIdx >= 0 &&
+             TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
+    };
+    int WaitStatesNeededForDef =
+      VALUWaitStates - getWaitStatesSince(IsHazardFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+  }
+  return WaitStatesNeeded;
+}
Index: test/CodeGen/MIR/AMDGPU/inserted-wait-states.mir
===================================================================
--- test/CodeGen/MIR/AMDGPU/inserted-wait-states.mir
+++ test/CodeGen/MIR/AMDGPU/inserted-wait-states.mir
@@ -1,11 +1,12 @@
 # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,VI
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI
 
 --- |
   define void @div_fmas() { ret void }
   define void @s_getreg() { ret void }
   define void @s_setreg() { ret void }
+  define void @vmem_gt_8dw_store() { ret void }
 ...
 ---
 # GCN-LABEL: name: div_fmas
@@ -159,3 +160,77 @@
     S_SETREG_B32 %sgpr1, 0
     S_ENDPGM
 ...
+
+...
+---
+# GCN-LABEL: name: vmem_gt_8dw_store
+
+# GCN-LABEL: bb.0:
+# GCN: BUFFER_STORE_DWORD_OFFSET
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX3_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX4_OFFSET
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX4_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_FORMAT_XYZ_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+
+# GCN-LABEL: bb.1:
+# GCN: FLAT_STORE_DWORDX2
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_STORE_DWORDX3
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_STORE_DWORDX4
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_ATOMIC_CMPSWAP_X2
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_ATOMIC_FCMPSWAP_X2
+# CIVI: S_NOP
+# GCN: V_MOV_B32
+
+name: vmem_gt_8dw_store
+
+body: |
+  bb.0:
+    successors: %bb.1
+    BUFFER_STORE_DWORD_OFFSET %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_DWORDX3_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_FORMAT_XYZ_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_FORMAT_XYZW_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_ATOMIC_CMPSWAP_X2_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    S_ENDPGM
+
+...
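Note (illustration, not part of the patch): two cases above intentionally get no S_NOP,
matching the soffset check in createsVALUHazard(). A MUBUF/MTBUF store that takes its
soffset in an SGPR is exempt from the hazard, which is why the first
BUFFER_STORE_DWORDX4_OFFSET, with soffset %sgpr4, is followed directly by V_MOV_B32 on
all targets. Sketch with illustrative operand syntax:

    buffer_store_dwordx4 v[2:5], s[0:3], s4  ; >8-byte store, but soffset is an SGPR
    v_mov_b32 v3, 0                          ; no wait state required

Stores of 8 bytes or less (BUFFER_STORE_DWORD and FLAT_STORE_DWORDX2 above) are
likewise exempt, since the VDataRC->getSize() > 8 test fails for them.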