Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -119,6 +119,18 @@
   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
 
+  /// \param DPP The DPP instruction.
+  /// \param SearchI The iterator at which to start looking for hazards.
+  /// \param SearchMBB The basic block we are operating on.
+  /// \param WaitStates The number of wait states that need to be inserted
+  /// when a hazard is detected.
+  void insertDPPWaitStates(MachineBasicBlock::iterator DPP,
+                           MachineBasicBlock::reverse_iterator SearchI,
+                           MachineBasicBlock *SearchMBB,
+                           unsigned WaitStates);
+
+  void insertDPPWaitStates(MachineBasicBlock::iterator DPP);
+
   /// Return true if there are LGKM instructions that haven't been waited on
   /// yet.
   bool hasOutstandingLGKM() const;
@@ -480,6 +492,45 @@
   }
 }
 
+void SIInsertWaits::insertDPPWaitStates(MachineBasicBlock::iterator DPP,
+                                        MachineBasicBlock::reverse_iterator SearchI,
+                                        MachineBasicBlock *SearchMBB,
+                                        unsigned WaitStates) {
+
+  MachineBasicBlock::reverse_iterator E = SearchMBB->rend();
+
+  for (; WaitStates > 0; --WaitStates, ++SearchI) {
+
+    // If we have reached the start of the block, we need to check predecessors.
+    if (SearchI == E) {
+      for (MachineBasicBlock *Pred : SearchMBB->predecessors()) {
+        // We only need to check fall-through blocks. Branch instructions
+        // give us enough wait states.
+        if (Pred->getFirstTerminator() == Pred->end()) {
+          insertDPPWaitStates(DPP, Pred->rbegin(), Pred, WaitStates);
+          break;
+        }
+      }
+      return;
+    }
+
+    for (MachineOperand &Op : SearchI->operands()) {
+      if (!Op.isReg() || !Op.isDef())
+        continue;
+
+      if (DPP->readsRegister(Op.getReg(), TRI)) {
+        TII->insertWaitStates(DPP, WaitStates);
+        return;
+      }
+    }
+  }
+}
+
+void SIInsertWaits::insertDPPWaitStates(MachineBasicBlock::iterator DPP) {
+  MachineBasicBlock::reverse_iterator I(DPP);
+  insertDPPWaitStates(DPP, I, DPP->getParent(), 2);
+}
+
 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 // around other non-memory instructions.
 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@@ -546,6 +597,10 @@
         }
       }
 
+      if (TII->isDPP(*I)) {
+        insertDPPWaitStates(I);
+      }
+
       // Wait for everything before a barrier.
       if (I->getOpcode() == AMDGPU::S_BARRIER)
         Changes |= insertWait(MBB, I, LastIssued);
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
@@ -301,6 +301,14 @@
     return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
   }
 
+  static bool isDPP(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::DPP;
+  }
+
+  bool isDPP(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::DPP;
+  }
+
   bool isInlineConstant(const APInt &Imm) const;
   bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
   bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -1,6 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
 
+; FIXME: The register allocator / scheduler should be able to avoid these hazards.
+
 ; VI-LABEL: {{^}}dpp_test:
+; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
+; VI: s_nop 1
 ; VI: v_mov_b32_dpp v0, v0 quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
 define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
@@ -8,6 +12,51 @@
   ret void
 }
 
+; VI-LABEL: {{^}}dpp_wait_states:
+; VI: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+  store i32 %tmp1, i32 addrspace(1)* %out
+  ret void
+}
+
+; VI-LABEL: {{^}}dpp_first_in_bb:
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
+  %cmp = fcmp oeq float %cond, 0.0
+  br i1 %cmp, label %if, label %else
+
+if:
+  %out_val = load float, float addrspace(1)* %out
+  %if_val = fadd float %a, %out_val
+  br label %endif
+
+else:
+  %in_val = load float, float addrspace(1)* %in
+  %else_val = fadd float %b, %in_val
+  br label %endif
+
+endif:
+  %val = phi float [%if_val, %if], [%else_val, %else]
+  %val_i32 = bitcast float %val to i32
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %val_i32, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp2 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp1, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp_float = bitcast i32 %tmp2 to float
+  store float %tmp_float, float addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0
 
 attributes #0 = { nounwind readnone convergent }