Index: lib/Target/AMDGPU/GCNHazardRecognizer.h
===================================================================
--- lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -84,6 +84,7 @@
   int checkAnyInstHazards(MachineInstr *MI);
   int checkReadM0Hazards(MachineInstr *SMovRel);
   int checkNSAtoVMEMHazard(MachineInstr *MI);
+  int checkFPAtomicToDenromModeHazard(MachineInstr *MI);
   void fixHazards(MachineInstr *MI);
   bool fixVcmpxPermlaneHazards(MachineInstr *MI);
   bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);
Index: lib/Target/AMDGPU/GCNHazardRecognizer.cpp
===================================================================
--- lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -145,6 +145,9 @@
   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
     return NoopHazard;
 
+  if (checkFPAtomicToDenromModeHazard(MI) > 0)
+    return NoopHazard;
+
   if (ST.hasNoDataDepHazard())
     return NoHazard;
 
@@ -247,6 +250,8 @@
   if (ST.hasNSAtoVMEMBug())
     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
 
+  WaitStates = std::max(WaitStates, checkFPAtomicToDenromModeHazard(MI));
+
   if (ST.hasNoDataDepHazard())
     return WaitStates;
 
@@ -1138,3 +1143,61 @@
 
   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
 }
+
+/// Returns the wait states still required before \p MI when it is an
+/// S_DENORM_MODE issued after a VMEM/FLAT floating-point atomic. Callers
+/// treat only a positive result as a hazard. The hazard window closes after
+/// FPAtomicToDenromModeWaitStates wait states, or at any VALU or S_WAITCNT*
+/// instruction.
+int GCNHazardRecognizer::checkFPAtomicToDenromModeHazard(MachineInstr *MI) {
+  const int FPAtomicToDenromModeWaitStates = 3;
+
+  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
+    return 0;
+
+  auto IsHazardFn = [] (MachineInstr *I) {
+    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
+      return false;
+    // An atomic read-modify-write both loads and stores.
+    if (!I->mayLoad() || !I->mayStore())
+      return false;
+    // Look at every memory operand, not just the first one.
+    for (const MachineMemOperand *MemOp : I->memoperands()) {
+      if (!MemOp->isAtomic())
+        continue;
+      const Value *V = MemOp->getValue();
+      // With no typed IR pointer the access type is unknown, so assume the
+      // worst. Otherwise any FP type (half, float, double or a vector of
+      // them) counts, not only 32-bit float.
+      if (!V || !V->getType()->isPointerTy() ||
+          V->getType()->getPointerElementType()->isFPOrFPVectorTy())
+        return true;
+    }
+    return false;
+  };
+
+  // [=] lets the lambda share the wait-state constant with the final
+  // subtraction instead of repeating the literal.
+  auto IsExpiredFn = [=] (MachineInstr *I, int WaitStates) {
+    if (WaitStates >= FPAtomicToDenromModeWaitStates ||
+        SIInstrInfo::isVALU(*I))
+      return true;
+
+    switch (I->getOpcode()) {
+    case AMDGPU::S_WAITCNT:
+    case AMDGPU::S_WAITCNT_VSCNT:
+    case AMDGPU::S_WAITCNT_VMCNT:
+    case AMDGPU::S_WAITCNT_EXPCNT:
+    case AMDGPU::S_WAITCNT_LGKMCNT:
+    case AMDGPU::S_WAITCNT_IDLE:
+      return true;
+    default:
+      break;
+    }
+
+    return false;
+  };
+
+  return FPAtomicToDenromModeWaitStates -
+         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
+}
Index: test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir
@@ -0,0 +1,41 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: flat_fp_atomic_to_s_denorm_mode
+# GCN: FLAT_ATOMIC_FMIN
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: S_DENORM_MODE
+---
+name: flat_fp_atomic_to_s_denorm_mode
+body: |
+  bb.0:
+    FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    S_DENORM_MODE 0
+...
+
+# GCN-LABEL: name: flat_fp_atomic_to_s_denorm_mode_waitcnt
+# GCN: FLAT_ATOMIC_FMIN
+# GCN-NEXT: S_WAITCNT
+# GCN-NEXT: S_DENORM_MODE
+---
+name: flat_fp_atomic_to_s_denorm_mode_waitcnt
+body: |
+  bb.0:
+    FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    S_WAITCNT 0
+    S_DENORM_MODE 0
+...
+# VALU between the FP atomic and S_DENORM_MODE: the checks expect nothing to be inserted.
+# GCN-LABEL: name: flat_fp_atomic_to_s_denorm_mode_valu
+# GCN: FLAT_ATOMIC_FMIN
+# GCN-NEXT: V_ADD_F32_e32
+# GCN-NEXT: S_DENORM_MODE
+---
+name: flat_fp_atomic_to_s_denorm_mode_valu
+body: |
+  bb.0:
+    FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`)
+    %2:vgpr_32 = V_ADD_F32_e32 undef %1:vgpr_32, undef %1:vgpr_32, implicit $exec
+    S_DENORM_MODE 0
+...