diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -737,6 +737,12 @@ "Hardware automatically inserts waitcnt before barrier" >; +def FeatureBackOffBarrier : SubtargetFeature <"back-off-barrier", + "BackOffBarrier", + "true", + "Hardware supports backing off s_barrier if an exception occurs" +>; + def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range", "HasTrigReducedRange", "true", @@ -1025,7 +1031,8 @@ FeatureMadMacF32Insts, FeatureSupportsSRAMECC, FeaturePackedTID, - FullRate64Ops]>; + FullRate64Ops, + FeatureBackOffBarrier]>; def FeatureISAVersion9_0_C : FeatureSet< [FeatureGFX9, @@ -1094,7 +1101,8 @@ FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureSupportsXNACK])>; + FeatureSupportsXNACK, + FeatureBackOffBarrier])>; def FeatureISAVersion10_1_1 : FeatureSet< !listconcat(FeatureGroup.GFX10_1_Bugs, @@ -1116,7 +1124,8 @@ FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureSupportsXNACK])>; + FeatureSupportsXNACK, + FeatureBackOffBarrier])>; def FeatureISAVersion10_1_2 : FeatureSet< !listconcat(FeatureGroup.GFX10_1_Bugs, @@ -1138,7 +1147,8 @@ FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureSupportsXNACK])>; + FeatureSupportsXNACK, + FeatureBackOffBarrier])>; def FeatureISAVersion10_1_3 : FeatureSet< !listconcat(FeatureGroup.GFX10_1_Bugs, @@ -1156,7 +1166,8 @@ FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLdsMisalignedBug, - FeatureSupportsXNACK])>; + FeatureSupportsXNACK, + FeatureBackOffBarrier])>; def FeatureISAVersion10_3_0 : FeatureSet< [FeatureGFX10, @@ -1173,7 +1184,8 @@ FeatureNSAEncoding, FeatureNSAMaxSize13, FeatureWavefrontSize32, - FeatureShaderCyclesRegister]>; + FeatureShaderCyclesRegister, + FeatureBackOffBarrier]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -72,6 +72,7 @@ // Dynamically set bits that enable features. bool FlatForGlobal; bool AutoWaitcntBeforeBarrier; + bool BackOffBarrier; bool UnalignedScratchAccess; bool UnalignedAccessMode; bool HasApertureRegs; @@ -493,6 +494,12 @@ return AutoWaitcntBeforeBarrier; } + /// \returns true if the target supports backing off of s_barrier instructions + /// when an exception is raised. + bool supportsBackOffBarrier() const { + return BackOffBarrier; + } + bool hasUnalignedBufferAccess() const { return UnalignedBufferAccess; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1135,12 +1135,12 @@ } } - // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0 - // occurs before the instruction. Doing it here prevents any additional - // S_WAITCNTs from being emitted if the instruction was marked as - // requiring a WAITCNT beforehand. + // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does + // not, we need to ensure the subtarget is capable of backing off barrier + // instructions in case there are any outstanding memory operations that may + // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. 
if (MI.getOpcode() == AMDGPU::S_BARRIER && - !ST->hasAutoWaitcntBeforeBarrier()) { + !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); } diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll @@ -0,0 +1,96 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s + +; Subtargets must wait for outstanding memory instructions before a barrier if +; they cannot back off of the barrier. + +define void @back_off_barrier_no_fence(i32* %in, i32* %out) #0 { +; GFX9-NO-BACKOFF-LABEL: back_off_barrier_no_fence: +; GFX9-NO-BACKOFF: ; %bb.0: +; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NO-BACKOFF-NEXT: flat_load_dword v0, v[0:1] +; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NO-BACKOFF-NEXT: s_barrier +; GFX9-NO-BACKOFF-NEXT: flat_store_dword v[2:3], v0 +; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NO-BACKOFF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-BACKOFF-LABEL: back_off_barrier_no_fence: +; GFX9-BACKOFF: ; %bb.0: +; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-BACKOFF-NEXT: flat_load_dword v0, v[0:1] +; GFX9-BACKOFF-NEXT: s_barrier +; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-BACKOFF-NEXT: flat_store_dword v[2:3], v0 +; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-BACKOFF-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-BACKOFF-LABEL: back_off_barrier_no_fence: +; GFX10-BACKOFF: ; %bb.0: +; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-BACKOFF-NEXT: flat_load_dword v0, v[0:1] +; GFX10-BACKOFF-NEXT: s_barrier +; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-BACKOFF-NEXT: flat_store_dword v[2:3], v0 +; GFX10-BACKOFF-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-BACKOFF-NEXT: s_setpc_b64 s[30:31] + %load = load i32, i32* %in + call void @llvm.amdgcn.s.barrier() + store i32 %load, i32* %out + ret void +} + +define void @back_off_barrier_with_fence(i32* %in, i32* %out) #0 { +; GFX9-NO-BACKOFF-LABEL: back_off_barrier_with_fence: +; GFX9-NO-BACKOFF: ; %bb.0: +; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NO-BACKOFF-NEXT: flat_load_dword v0, v[0:1] +; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NO-BACKOFF-NEXT: s_barrier +; GFX9-NO-BACKOFF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-BACKOFF-NEXT: flat_store_dword v[2:3], v0 +; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NO-BACKOFF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-BACKOFF-LABEL: back_off_barrier_with_fence: +; GFX9-BACKOFF: ; %bb.0: +; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-BACKOFF-NEXT: flat_load_dword v0, v[0:1] +; GFX9-BACKOFF-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-BACKOFF-NEXT: s_barrier +; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-BACKOFF-NEXT: flat_store_dword v[2:3], v0 +; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-BACKOFF-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-BACKOFF-LABEL: back_off_barrier_with_fence: +; GFX10-BACKOFF: ; %bb.0: +; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-BACKOFF-NEXT: flat_load_dword v0, v[0:1] +; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-BACKOFF-NEXT: s_barrier +; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-BACKOFF-NEXT: buffer_gl0_inv +; GFX10-BACKOFF-NEXT: flat_store_dword v[2:3], v0 +; GFX10-BACKOFF-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-BACKOFF-NEXT: s_setpc_b64 s[30:31] + %load = load i32, i32* %in + fence syncscope("workgroup") release + call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + store i32 %load, i32* %out + ret void +} + +declare void @llvm.amdgcn.s.barrier() + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir @@ -35,7 +35,7 @@ ; GFX10: S_WAITCNT 0 ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec - ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1 ; GFX10: S_BARRIER ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr ; GFX10: S_WAITCNT 112 @@ -112,7 +112,7 @@ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec ; GFX10: S_WAITCNT 0 - ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1 ; GFX10: S_BARRIER ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr ; GFX10: S_WAITCNT 112 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s ; GCN-LABEL: barrier_vmcnt_global: ; GFX8: flat_load_dword @@ -42,7 +42,7 @@ %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4 store i32 0, i32 addrspace(1)* %tmp5, align 4 fence syncscope("singlethread") release - tail call void @llvm.amdgcn.s.barrier() #3 + tail call void @llvm.amdgcn.s.barrier() fence syncscope("singlethread") acquire %tmp6 = add nuw nsw i64 %tmp2, 4294967296 %tmp7 = lshr exact i64 %tmp6, 32 @@ -116,7 +116,7 @@ %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4 store i32 0, i32* %tmp5, align 4 fence syncscope("singlethread") 
release - tail call void @llvm.amdgcn.s.barrier() #3 + tail call void @llvm.amdgcn.s.barrier() fence syncscope("singlethread") acquire %tmp6 = add nuw nsw i64 %tmp2, 4294967296 %tmp7 = lshr exact i64 %tmp6, 32
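The SIInsertWaitcnts change above (only force a wait before s_barrier when the subtarget can neither auto-wait nor back off) can be exercised directly with llc. This is a sketch only: "input.ll" is a placeholder for any module that calls @llvm.amdgcn.s.barrier, and the -mattr spellings follow the same syntax as the RUN lines in the new test.

  llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier -verify-machineinstrs < input.ll
  llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+back-off-barrier -verify-machineinstrs < input.ll

The first run keeps the explicit pre-barrier s_waitcnt on a target that otherwise defaults to back-off-barrier (this is what the third RUN line of the new test checks via GFX9-NO-BACKOFF); the second drops it on a target that lacks the feature by default, matching the new !ST->supportsBackOffBarrier() condition.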