Index: lib/Target/AMDGPU/DSInstructions.td =================================================================== --- lib/Target/AMDGPU/DSInstructions.td +++ lib/Target/AMDGPU/DSInstructions.td @@ -314,13 +314,16 @@ class DS_GWS_0D : DS_GWS; + (ins offset:$offset, gds:$gds), "$offset gds"> { + let hasSideEffects = 1; +} class DS_GWS_1D : DS_GWS { let has_data0 = 1; + let hasSideEffects = 1; } class DS_VOID : DS_PseudogetInstrInfo(); + auto I = MI.getIterator(); + auto E = std::next(I); + + BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + MIBundleBuilder Bundler(*MBB, I, E); + finalizeBundle(*MBB, Bundler.begin()); +} + MachineBasicBlock * SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -3062,8 +3076,7 @@ MRI.setSimpleHint(Data0, Src->getReg()); } - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); + bundleInstWithWaitcnt(MI); unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -3782,8 +3795,12 @@ case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: case AMDGPU::DS_GWS_BARRIER: - if (getSubtarget()->hasGWSAutoReplay()) + // A s_waitcnt 0 is required to be the instruction immediately following. + if (getSubtarget()->hasGWSAutoReplay()) { + bundleInstWithWaitcnt(MI); return BB; + } + return emitGWSMemViolTestLoop(MI, BB); default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1392,7 +1392,7 @@ break; } case TargetOpcode::BUNDLE: { - if (!MI.mayLoad()) + if (!MI.mayLoad() || MI.hasUnmodeledSideEffects()) return false; // If it is a load it must be a memory clause Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -4,6 +4,11 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s +; Make sure the op is emitted bundled with a waitcnt with and without the retry loop, and the bundle is not removed by ExpandPostRAPseudos. +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s + + ; Minimum offset ; GCN-LABEL: {{^}}gws_barrier_offset0: ; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] @@ -18,11 +23,19 @@ ; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1) ; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0 ; LOOP-NEXT: s_cbranch_scc1 [[LOOP]] + +; MIR-LABEL: name: gws_barrier_offset0{{$}} +; MIR: BUNDLE implicit{{( killed)?}} $vgpr0, implicit $m0, implicit $exec { +; MIR-NEXT: DS_GWS_BARRIER $vgpr0, 1, -1, implicit $m0, implicit $exec :: (load 4 from custom GWSResource) +; MIR-NEXT: S_WAITCNT 0 +; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0) ret void } +; MIR-LABEL: name: gws_barrier_offset63{{$}} + ; Maximum offset ; GCN-LABEL: {{^}}gws_barrier_offset63: ; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] @@ -103,7 +116,7 @@ ; Make sure this increments lgkmcnt ; GCN-LABEL: {{^}}gws_barrier_lgkmcnt: ; NOLOOP: ds_gws_barrier v0 offset:1 gds{{$}} -; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOLOOP-NEXT: s_setpc_b64 define void @gws_barrier_lgkmcnt(i32 %val) { call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0) @@ -122,7 +135,7 @@ ; GCN-LABEL: {{^}}gws_barrier_wait_after: ; NOLOOP: ds_gws_barrier v0 offset:8 gds -; NOLOOP-NEXT: s_waitcnt expcnt(0){{$}} +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOLOOP-NEXT: load_dword define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 { call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) @@ -135,6 +148,7 @@ ; NOLOOP: store_dword ; NOLOOP: s_waitcnt vmcnt(0) lgkmcnt(0) ; NOLOOP: ds_gws_barrier v0 offset:8 gds +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 { store i32 0, i32 addrspace(1)* %ptr fence release @@ -142,9 +156,11 @@ ret void } +; FIXME: Extra waitcnt ; GCN-LABEL: {{^}}gws_barrier_fence_after: ; NOLOOP: ds_gws_barrier v0 offset:8 gds ; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; NOLOOP-NEXT: load_dword define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 { @@ -158,7 +174,9 @@ ; GCN-LABEL: {{^}}gws_init_barrier: ; NOLOOP: s_mov_b32 m0, -1 ; NOLOOP: ds_gws_init v0 offset:8 gds +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 { call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7) call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) @@ -169,9 +187,11 @@ ; GCN-LABEL: {{^}}gws_init_fence_barrier: ; NOLOOP: s_mov_b32 m0, -1 ; NOLOOP: ds_gws_init v0 offset:8 gds +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 { call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7) fence release Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll @@ -111,7 +111,7 @@ ; GCN-LABEL: {{^}}gws_init_lgkmcnt: ; NOLOOP: ds_gws_init v0 offset:1 gds{{$}} -; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOLOOP-NEXT: s_setpc_b64 define void @gws_init_lgkmcnt(i32 %val) { call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0) @@ -120,8 +120,10 @@ ; Does not imply memory fence on its own ; GCN-LABEL: {{^}}gws_init_wait_before: -; NOLOOP: s_waitcnt +; NOLOOP: s_waitcnt lgkmcnt(0) ; NOLOOP-NOT: s_waitcnt +; NOLOOP: ds_gws_init +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 { store i32 0, i32 addrspace(1)* %ptr call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)