Index: lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaits.cpp
+++ lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -179,6 +179,19 @@
   return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
 }
 
+/// Return true if \p MI has exactly one memory operand and that operand is in
+/// the global or constant address space. Returns false otherwise, including
+/// when \p MI does not carry exactly one memory operand.
+static bool hasGlobalMemOperand(const MachineInstr &MI) {
+  if (!MI.hasOneMemOperand())
+    return false;
+
+  const MachineMemOperand *MMO = *MI.memoperands_begin();
+  unsigned AS = MMO->getAddrSpace();
+  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+         AS == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
   uint64_t TSFlags = MI.getDesc().TSFlags;
   Counters Result = { { 0, 0, 0 } };
@@ -208,6 +221,11 @@
         // XXX - What is the right value?
         Result.Named.LGKM = 1;
       }
+    } else if (TII->isFLAT(MI)) {
+      // A flat access known to be global or constant does not need an lgkm
+      // wait; anything else is conservatively counted against LGKM.
+      if (!hasGlobalMemOperand(MI))
+        Result.Named.LGKM = 1;
     } else {
       // DS
       Result.Named.LGKM = 1;
Index: test/CodeGen/AMDGPU/waitcnt-flat.ll
===================================================================
--- test/CodeGen/AMDGPU/waitcnt-flat.ll
+++ test/CodeGen/AMDGPU/waitcnt-flat.ll
@@ -5,12 +5,52 @@
 ; operand, this test is not broken. It just means it is no longer testing
 ; for the original bug.
 
-; GCN: {{^}}test:
+; GCN-LABEL: {{^}}global_test:
+; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}],
+; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: flat_load_dword
+
+; Test pointer problem
 ; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
 ; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
-define void @test(i32 addrspace(1)* %out, i32 %in) {
+define void @global_test(i32 addrspace(1)* %out, i32 %in) {
   store volatile i32 0, i32 addrspace(1)* %out
   %val = load volatile i32, i32 addrspace(1)* %out
   ret void
 }
+
+; GCN-LABEL: {{^}}flat_test:
+; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}],
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN: flat_load_dword
+define void @flat_test(i32 addrspace(4)* %out, i32 %in) {
+  store volatile i32 0, i32 addrspace(4)* %out
+  %val = load volatile i32, i32 addrspace(4)* %out
+  ret void
+}
+
+; If the store is through a global pointer rather than a generic one, the
+; lgkmcnt wait is not needed, even though the following load is generic.
+
+; GCN-LABEL: {{^}}global_flat_test:
+; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}],
+; GCN: s_waitcnt vmcnt(0){{$}}
+; GCN: flat_load_dword
+define void @global_flat_test(i32 addrspace(1)* %out, i32 %in) {
+  store volatile i32 0, i32 addrspace(1)* %out
+  %out.cast = addrspacecast i32 addrspace(1)* %out to i32 addrspace(4)*
+  %val = load volatile i32, i32 addrspace(4)* %out.cast
+  ret void
+}
+
+; GCN-LABEL: {{^}}flat_global_test:
+; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}],
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN: flat_load_dword
+define void @flat_global_test(i32 addrspace(1)* %out, i32 %in) {
+  %out.cast = addrspacecast i32 addrspace(1)* %out to i32 addrspace(4)*
+  store volatile i32 0, i32 addrspace(4)* %out.cast
+  %val = load volatile i32, i32 addrspace(1)* %out
+  ret void
+}