diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -87,29 +87,29 @@ }; enum WaitEventType { - VMEM_ACCESS, // vector-memory read & write - VMEM_READ_ACCESS, // vector-memory read - VMEM_WRITE_ACCESS,// vector-memory write - LDS_ACCESS, // lds read & write - GDS_ACCESS, // gds read & write - SQ_MESSAGE, // send message - SMEM_ACCESS, // scalar-memory read & write - EXP_GPR_LOCK, // export holding on its data src - GDS_GPR_LOCK, // GDS holding on its data and addr src - EXP_POS_ACCESS, // write to export position - EXP_PARAM_ACCESS, // write to export parameter - VMW_GPR_LOCK, // vector-memory write holding on its data src + VMEM_ACCESS, // vector-memory read & write + VMEM_READ_ACCESS, // vector-memory read + VMEM_WRITE_ACCESS, // vector-memory write + LDS_ACCESS, // lds read & write + GDS_ACCESS, // gds read & write + SQ_MESSAGE, // send message + SMEM_ACCESS, // scalar-memory read & write + EXP_GPR_LOCK, // export holding on its data src + GDS_GPR_LOCK, // GDS holding on its data and addr src + EXP_POS_ACCESS, // write to export position + EXP_PARAM_ACCESS, // write to export parameter + VMW_GPR_LOCK, // vector-memory write holding on its data src + EXP_LDS_ACCESS, // read by ldsdir counting as export NUM_WAIT_EVENTS, }; static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { - (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), - (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | - (1 << SQ_MESSAGE), - (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | - (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS), - (1 << VMEM_WRITE_ACCESS) -}; + (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), + (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | + (1 << SQ_MESSAGE), + (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | + (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS), + (1 << VMEM_WRITE_ACCESS)}; // The mapping is: // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs @@ -596,6 +596,12 @@ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), CurrScore); } + } else if (TII->isLDSDIR(Inst)) { + // LDSDIR instructions attach the score to the destination. + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst), + CurrScore); } else { if (TII->isEXP(Inst)) { // For export the destination registers are really temps that @@ -1135,7 +1141,7 @@ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); ScoreBrackets.clearVgprVmemTypes(RegNo); } - if (Op.isDef()) { + if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { ScoreBrackets.determineWait( EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } @@ -1192,6 +1198,19 @@ ScoreBrackets.applyWaitcnt(Wait); } + // ExpCnt can be merged into VINTERP. + if (Wait.ExpCnt != ~0u && SIInstrInfo::isVINTERP(MI)) { + MachineOperand *WaitExp = TII->getNamedOperand(MI, AMDGPU::OpName::waitexp); + if (Wait.ExpCnt < WaitExp->getImm()) { + WaitExp->setImm(Wait.ExpCnt); + Modified = true; + } + Wait.ExpCnt = ~0u; + + LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" + << "Update Instr: " << MI); + } + // Build new waitcnt instructions unless no wait is needed or the old waitcnt // instruction was modified to handle the required wait. if (Wait.hasWaitExceptVsCnt()) { @@ -1350,6 +1369,11 @@ // May need to way wait for anything. ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); } + } else if (SIInstrInfo::isLDSDIR(Inst)) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst); + } else if (TII->isVINTERP(Inst)) { + int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm(); + ScoreBrackets->applyWaitcnt(EXP_CNT, Imm); } else if (SIInstrInfo::isEXP(Inst)) { unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -9,8 +9,8 @@ ; GCN-NEXT: lds_param_load v0, attr0.y ; GCN-NEXT: lds_param_load v1, attr1.x ; GCN-NEXT: v_mov_b32_e32 v4, s1 -; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 +; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 ; GCN-NEXT: v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7 ; GCN-NEXT: exp mrt0 v3, v2, v0, v1 done @@ -36,10 +36,10 @@ ; GCN-NEXT: lds_param_load v2, attr2.x ; GCN-NEXT: lds_param_load v3, attr3.x ; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 +; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 +; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 +; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 ; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7 @@ -73,10 +73,10 @@ ; GCN-NEXT: lds_param_load v4, attr2.x ; GCN-NEXT: lds_param_load v5, attr3.x ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3 +; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 +; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 +; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 ; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7 @@ -111,7 +111,7 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: lds_param_load v0, attr0.x ; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7 +; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 ; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7 ; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7 ; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll @@ -9,8 +9,8 @@ ; GCN-NEXT: lds_param_load v0, attr0.y ; GCN-NEXT: lds_param_load v1, attr1.x ; GCN-NEXT: v_mov_b32_e32 v4, s1 -; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 +; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 ; GCN-NEXT: v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7 ; GCN-NEXT: exp mrt0 v3, v2, v0, v1 done @@ -36,10 +36,10 @@ ; GCN-NEXT: lds_param_load v2, attr2.x ; GCN-NEXT: lds_param_load v3, attr3.x ; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 +; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 +; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 +; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 ; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7 @@ -73,10 +73,10 @@ ; GCN-NEXT: lds_param_load v4, attr2.x ; GCN-NEXT: lds_param_load v5, attr3.x ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7 -; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7 +; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3 +; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 +; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 +; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 ; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7 ; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7 @@ -111,7 +111,7 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: lds_param_load v0, attr0.x ; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7 +; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 ; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7 ; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7 ; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7