Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -812,6 +812,21 @@
          !MI.getOperand(1).isUndef();
 }
 
+/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
+static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
+  // Currently all conventions wait, but this may not always be the case.
+  //
+  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
+  // sense to omit the wait and do it in the caller.
+  return true;
+}
+
+/// \returns true if the callee is expected to wait for any outstanding waits
+/// before returning.
+static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
+  return true;
+}
+
 /// Generate s_waitcnt instruction to be placed before cur_Inst.
 /// Instructions of a given type are returned in order,
 /// but instructions of different types can complete out of order.
@@ -927,91 +942,91 @@
       }
     }
 
-#if 0 // TODO: the following code to handle CALL.
-      // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
-      // However, there is a problem with EXP_CNT, because the call cannot
-      // easily tell if a register is used in the function, and if it did, then
-      // the referring instruction would have to have an S_WAITCNT, which is
-      // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
-      // before the call.
-      if (MI.getOpcode() == SC_CALL) {
-        if (ScoreBrackets->getScoreUB(EXP_CNT) >
-            ScoreBrackets->getScoreLB(EXP_CNT)) {
-          ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
-          EmitWaitcnt |= CNT_MASK(EXP_CNT);
-        }
-      }
-#endif
-
-    // FIXME: Should not be relying on memoperands.
-    // Look at the source operands of every instruction to see if
-    // any of them results from a previous memory operation that affects
-    // its current usage. If so, an s_waitcnt instruction needs to be
-    // emitted.
-    // If the source operand was defined by a load, add the s_waitcnt
-    // instruction.
-    for (const MachineMemOperand *Memop : MI.memoperands()) {
-      unsigned AS = Memop->getAddrSpace();
-      if (AS != AMDGPUAS::LOCAL_ADDRESS)
-        continue;
-      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
-      // VM_CNT is only relevant to vgpr or LDS.
-      ScoreBrackets.determineWait(
-          VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
-    }
+    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
+      // Don't bother waiting on anything except the call address. The function
+      // is going to insert a wait on everything in its prolog. This still needs
+      // to be careful if the call target is a load (e.g. a GOT load).
+      Wait = AMDGPU::Waitcnt();
 
-    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
-      const MachineOperand &Op = MI.getOperand(I);
-      const MachineRegisterInfo &MRIA = *MRI;
-      RegInterval Interval =
-          ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+      int CallAddrOpIdx =
+          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+      RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI,
+                                                          CallAddrOpIdx, false);
       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-        if (TRI->isVGPR(MRIA, Op.getReg())) {
-          // VM_CNT is only relevant to vgpr or LDS.
-          ScoreBrackets.determineWait(
-              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
-        }
         ScoreBrackets.determineWait(
             LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
       }
-    }
-    // End of for loop that looks at all source operands to decide vm_wait_cnt
-    // and lgk_wait_cnt.
-
-    // Two cases are handled for destination operands:
-    // 1) If the destination operand was defined by a load, add the s_waitcnt
-    // instruction to guarantee the right WAW order.
-    // 2) If a destination operand that was used by a recent export/store ins,
-    // add s_waitcnt on exp_cnt to guarantee the WAR order.
-    if (MI.mayStore()) {
+    } else {
       // FIXME: Should not be relying on memoperands.
+      // Look at the source operands of every instruction to see if
+      // any of them results from a previous memory operation that affects
+      // its current usage. If so, an s_waitcnt instruction needs to be
+      // emitted.
+      // If the source operand was defined by a load, add the s_waitcnt
+      // instruction.
       for (const MachineMemOperand *Memop : MI.memoperands()) {
         unsigned AS = Memop->getAddrSpace();
         if (AS != AMDGPUAS::LOCAL_ADDRESS)
           continue;
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+        // VM_CNT is only relevant to vgpr or LDS.
         ScoreBrackets.determineWait(
             VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
-        ScoreBrackets.determineWait(
-            EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
       }
-    }
-    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
-      MachineOperand &Def = MI.getOperand(I);
-      const MachineRegisterInfo &MRIA = *MRI;
-      RegInterval Interval =
-          ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
-      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-        if (TRI->isVGPR(MRIA, Def.getReg())) {
+
+      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+        const MachineOperand &Op = MI.getOperand(I);
+        const MachineRegisterInfo &MRIA = *MRI;
+        RegInterval Interval =
+            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+        for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+          if (TRI->isVGPR(MRIA, Op.getReg())) {
+            // VM_CNT is only relevant to vgpr or LDS.
+            ScoreBrackets.determineWait(
+                VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+          }
+          ScoreBrackets.determineWait(
+              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+        }
+      }
+      // End of for loop that looks at all source operands to decide vm_wait_cnt
+      // and lgk_wait_cnt.
+
+      // Two cases are handled for destination operands:
+      // 1) If the destination operand was defined by a load, add the s_waitcnt
+      // instruction to guarantee the right WAW order.
+      // 2) If a destination operand that was used by a recent export/store ins,
+      // add s_waitcnt on exp_cnt to guarantee the WAR order.
+      if (MI.mayStore()) {
+        // FIXME: Should not be relying on memoperands.
+        for (const MachineMemOperand *Memop : MI.memoperands()) {
+          unsigned AS = Memop->getAddrSpace();
+          if (AS != AMDGPUAS::LOCAL_ADDRESS)
+            continue;
+          unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
          ScoreBrackets.determineWait(
              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
          ScoreBrackets.determineWait(
              EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
         }
-        ScoreBrackets.determineWait(
-            LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
       }
-    } // End of for loop that looks at all dest operands.
+      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+        MachineOperand &Def = MI.getOperand(I);
+        const MachineRegisterInfo &MRIA = *MRI;
+        RegInterval Interval =
+            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
+        for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+          if (TRI->isVGPR(MRIA, Def.getReg())) {
+            ScoreBrackets.determineWait(
+                VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+            ScoreBrackets.determineWait(
+                EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+          }
+          ScoreBrackets.determineWait(
+              LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+        }
+      } // End of for loop that looks at all dest operands.
+    }
   }
 
   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
@@ -1228,6 +1243,14 @@
     }
   } else if (TII->isSMRD(Inst)) {
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+  } else if (Inst.isCall()) {
+    if (callWaitsOnFunctionReturn(Inst)) {
+      // Act as a wait on everything
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
+    } else {
+      // May need to wait for anything.
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
+    }
   } else {
     switch (Inst.getOpcode()) {
     case AMDGPU::S_SENDMSG:
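Note (illustration only, not part of the patch): the caller-entry behavior above can be pictured with a minimal, hypothetical IR example; the function and value names below are invented. Because the callee is assumed to begin with an s_waitcnt 0 in its prologue, the load feeding the call argument no longer forces an s_waitcnt in the caller, while the registers holding the call address are still checked against lgkmcnt (which matters when the target address is itself produced by a load, e.g. a GOT access).

; Hypothetical sketch; names are made up for illustration.
; The load defining %val needs no s_waitcnt before the call is emitted,
; since @callee is assumed to wait on everything in its own prologue.
declare void @callee(<2 x i32>)

define amdgpu_kernel void @no_wait_before_call(<2 x i32> addrspace(1)* %p) {
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %p
  call void @callee(<2 x i32> %val)
  ret void
}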
Index: test/CodeGen/AMDGPU/call-argument-types.ll
===================================================================
--- test/CodeGen/AMDGPU/call-argument-types.ll
+++ test/CodeGen/AMDGPU/call-argument-types.ll
@@ -154,7 +154,7 @@
 ; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s3
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
@@ -174,7 +174,7 @@
 ; GCN-DAG: s_mov_b32 s32, s33
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
@@ -205,7 +205,7 @@
 ; GCN-DAG: s_mov_b32 s32, s33
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
@@ -224,7 +224,7 @@
 ; GCN-DAG: s_mov_b32 s32, s33
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
 ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
@@ -267,8 +267,8 @@
 ; GCN-LABEL: {{^}}test_call_external_void_func_v2i64:
 ; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* null
   call void @external_void_func_v2i64(<2 x i64> %val)
@@ -290,8 +290,8 @@
 ; GCN: buffer_load_dwordx4 v[0:3]
 ; GCN: v_mov_b32_e32 v4, 1
 ; GCN: v_mov_b32_e32 v5, 2
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
   %load = load <2 x i64>, <2 x i64> addrspace(1)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> , <3 x i32> 
@@ -307,8 +307,8 @@
 ; GCN-DAG: v_mov_b32_e32 v6, 3
 ; GCN-DAG: v_mov_b32_e32 v7, 4
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
   %load = load <2 x i64>, <2 x i64> addrspace(1)* null
   %val = shufflevector <2 x i64> %load, <2 x i64> , <4 x i32> 
@@ -483,8 +483,8 @@
 ; GCN-LABEL: {{^}}test_call_external_void_func_v2i32:
 ; GCN: buffer_load_dwordx2 v[0:1]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
   call void @external_void_func_v2i32(<2 x i32> %val)
@@ -527,8 +527,8 @@
 ; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
 ; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
   call void @external_void_func_v4i32(<4 x i32> %val)
@@ -562,8 +562,8 @@
 ; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
   %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
@@ -591,8 +591,8 @@
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
 ; GCN-DAG: buffer_load_dwordx4 v[8:11], off
 ; GCN-DAG: buffer_load_dwordx4 v[12:15], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
   %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
@@ -609,8 +609,8 @@
 ; GCN-DAG: buffer_load_dwordx4 v[20:23], off
 ; GCN-DAG: buffer_load_dwordx4 v[24:27], off
 ; GCN-DAG: buffer_load_dwordx4 v[28:31], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
   %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
   %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
@@ -647,12 +647,11 @@
   ret void
 }
 
-; FIXME: No wait after call
 ; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm:
 ; GCN: v_mov_b32_e32 v0, 42
 ; GCN: s_swappc_b64 s[30:31],
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[36:39], 0
+; GCN-NOT: s_waitcnt
+; GCN: buffer_store_dword v0, off, s[36:39], 0
 define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
   %val = call i32 @external_i32_func_i32(i32 42)
   store volatile i32 %val, i32 addrspace(1)* %out
@@ -662,8 +661,8 @@
 ; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
 ; GCN: buffer_load_ubyte v0, off
 ; GCN: buffer_load_dword v1, off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
   %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
   %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
Index: test/CodeGen/AMDGPU/call-waitcnt.ll
===================================================================
--- test/CodeGen/AMDGPU/call-waitcnt.ll
+++ test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -17,7 +17,6 @@
 ; GCN-NEXT: s_getpc_b64 s[6:7]
 ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT: s_endpgm
   %vgpr = load volatile i32, i32 addrspace(3)* %ptr
@@ -67,7 +66,6 @@
 ; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: v_mov_b32_e32 v32, 0
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v0, s34
 ; GCN-NEXT: v_mov_b32_e32 v1, s35
 ; GCN-NEXT: global_store_dword v[0:1], v32, off
@@ -91,7 +89,6 @@
 ; GCN-NEXT: s_addc_u32 s7, s7, func.return@rel32@hi+4
 ; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v1, s34
 ; GCN-NEXT: v_mov_b32_e32 v2, s35
 ; GCN-NEXT: global_store_dword v[1:2], v0, off
@@ -138,7 +135,7 @@
   ret void
 }
 
-; Need to wait for the address dependency
+; No need to wait for the load.
 define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
 ; GCN-LABEL: tail_call_memory_arg_load:
 ; GCN:       ; %bb.0:
@@ -147,7 +144,6 @@
 ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4
 ; GCN-NEXT: ds_read_b32 v0, v0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[6:7]
   %vgpr = load volatile i32, i32 addrspace(3)* %ptr
   tail call void @func(i32 %vgpr)
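Note (illustration only, not part of the patch): the return side works the same way. Because callWaitsOnFunctionReturn() reports that the callee waits on its outstanding counters before returning, the caller's score brackets are treated as fully resolved after the call, so the post-call s_waitcnt instructions removed in the tests above are redundant. A minimal hypothetical sketch, with invented names:

; Hypothetical sketch; names are made up for illustration.
; The value returned by @get_value can be stored right after the call
; returns, with no extra s_waitcnt, because the callee is assumed to have
; already waited before returning.
declare i32 @get_value()

define amdgpu_kernel void @use_result_after_call(i32 addrspace(1)* %out) {
  %v = call i32 @get_value()
  store i32 %v, i32 addrspace(1)* %out
  ret void
}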