Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -38,7 +38,8 @@
   FLAT = 1 << 19,
   WQM = 1 << 20,
   VGPRSpill = 1 << 21,
-  VOPAsmPrefer32Bit = 1 << 22
+  VOPAsmPrefer32Bit = 1 << 22,
+  DPP = 1 << 23
 };
 }
 
Index: lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaits.cpp
+++ lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -119,6 +119,10 @@
   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
 
+  /// \brief Insert S_NOPs to pad a DPP instruction that reads a register
+  /// written by one of the two preceding VALU instructions.
+  void insertDPPWaitStates(MachineBasicBlock::iterator DPP);
+
   /// Return true if there are LGKM instructions that haven't been waited on
   /// yet.
   bool hasOutstandingLGKM() const;
@@ -483,6 +487,29 @@
   }
 }
 
+void SIInsertWaits::insertDPPWaitStates(MachineBasicBlock::iterator DPP) {
+  // A DPP instruction may not read a VGPR until two wait states after the
+  // VALU instruction that wrote it. Walk back over the two preceding
+  // instructions; each instruction in between already counts as one wait
+  // state, so only the remainder needs to be padded with S_NOPs.
+  MachineBasicBlock::iterator B = DPP->getParent()->begin();
+  MachineBasicBlock::iterator I = DPP;
+
+  for (unsigned WaitStates = 2; WaitStates > 0 && I != B; --WaitStates) {
+    --I;
+
+    for (MachineOperand &Op : I->operands()) {
+      if (!Op.isReg() || !Op.isDef())
+        continue;
+
+      if (DPP->readsRegister(Op.getReg(), TRI)) {
+        TII->insertWaitStates(DPP, WaitStates);
+        return;
+      }
+    }
+  }
+}
+
 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 // around other non-memory instructions.
 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@@ -549,6 +576,9 @@
         }
       }
 
+      if (TII->isDPP(*I))
+        insertDPPWaitStates(I);
+
       // Wait for everything before a barrier.
       if (I->getOpcode() == AMDGPU::S_BARRIER)
         Changes |= insertWait(MBB, I, LastIssued);
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -40,6 +40,7 @@
   field bits<1> FLAT = 0;
   field bits<1> WQM = 0;
   field bits<1> VGPRSpill = 0;
+  field bits<1> DPP = 0;
 
   // This bit tells the assembler to use the 32-bit encoding in case it
   // is unable to infer the encoding from the operands.
@@ -73,6 +74,7 @@
   let TSFlags{20} = WQM;
   let TSFlags{21} = VGPRSpill;
   let TSFlags{22} = VOPAsmPrefer32Bit;
+  let TSFlags{23} = DPP;
 
   let SchedRW = [Write32Bit];
 
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -301,6 +301,14 @@
     return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
   }
 
+  static bool isDPP(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::DPP;
+  }
+
+  bool isDPP(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::DPP;
+  }
+
   bool isInlineConstant(const APInt &Imm) const;
   bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
   bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
Index: lib/Target/AMDGPU/VIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/VIInstrFormats.td
+++ lib/Target/AMDGPU/VIInstrFormats.td
@@ -172,6 +172,7 @@
 class VOP_DPP <dag outs, dag ins, string asm, list<dag> pattern> :
     VOPAnyCommon <outs, ins, asm, pattern> {
   let Size = 8;
+  let DPP = 1;
 }
 
 class VOP_DPPe : Enc64 {
Index: test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -1,6 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
 
+; FIXME: The register allocator / scheduler should be able to avoid these hazards.
+
 ; VI-LABEL: {{^}}dpp_test:
+; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
+; VI: s_nop 1
 ; VI: v_mov_b32 v0, v0, 1, -1, 1, 1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
 define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i1 1, i32 1, i32 1) #0
@@ -8,6 +12,19 @@
   ret void
 }
 
+; VI-LABEL: {{^}}dpp_wait_states:
+; VI: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
+; VI: s_nop 1
+; VI: v_mov_b32 [[VGPR1:v[0-9]+]], [[VGPR0]], 1, -1, 1, 1
+; VI: s_nop 1
+; VI: v_mov_b32 v{{[0-9]+}}, [[VGPR1]], 1, -1, 1, 1
+define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i1 1, i32 1, i32 1) #0
+  %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i1 1, i32 1, i32 1) #0
+  store i32 %tmp1, i32 addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i1, i32, i32) #0
 
 attributes #0 = { nounwind readnone convergent }
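
Note for readers (not part of the patch): the hazard handled above is that a
DPP instruction may not read a VGPR until two wait states after the VALU
instruction that wrote it, and every instruction already sitting between the
write and the DPP read counts toward that budget. The helper below is a
minimal standalone sketch of that arithmetic; the function name and its
Distance parameter are illustrative only and do not exist in the LLVM tree.
Because "s_nop N" idles for N+1 cycles, the two required wait states show up
as "s_nop 1" in the CHECK lines above.

// Hypothetical mirror of the countdown loop in insertDPPWaitStates.
// Distance is how far back the defining VALU instruction is
// (1 = the immediately preceding instruction).
unsigned dppWaitStatesNeeded(unsigned Distance) {
  // Distance 1 -> 2 wait states, distance 2 -> 1, distance >= 3 -> none.
  return Distance >= 3 ? 0 : 3 - Distance;
}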