Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -38,7 +38,8 @@
   FLAT = 1 << 19,
   WQM = 1 << 20,
   VGPRSpill = 1 << 21,
-  VOPAsmPrefer32Bit = 1 << 22
+  VOPAsmPrefer32Bit = 1 << 22,
+  DPP = 1 << 23
 };
 }
 
Index: lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaits.cpp
+++ lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -119,6 +119,10 @@
   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
 
+  /// \brief Insert S_NOPs to pad a DPP instruction that reads a register
+  /// written by one of the two preceding VALU instructions.
+  void insertDPPWaitStates(MachineBasicBlock::iterator DPP);
+
   /// Return true if there are LGKM instructions that haven't been waited on
   /// yet.
   bool hasOutstandingLGKM() const;
@@ -483,6 +487,29 @@
   }
 }
 
+void SIInsertWaits::insertDPPWaitStates(MachineBasicBlock::iterator DPP) {
+  // A DPP instruction may not read a VGPR until two wait states after the
+  // VALU instruction that wrote it. Walk back over the two preceding
+  // instructions; each instruction in between already counts as one wait
+  // state, so only the remainder needs to be padded with S_NOPs.
+  MachineBasicBlock::iterator B = DPP->getParent()->begin();
+  MachineBasicBlock::iterator I = DPP;
+
+  for (unsigned WaitStates = 2; WaitStates > 0 && I != B; --WaitStates) {
+    --I;
+
+    for (MachineOperand &Op : I->operands()) {
+      if (!Op.isReg() || !Op.isDef())
+        continue;
+
+      if (DPP->readsRegister(Op.getReg(), TRI)) {
+        TII->insertWaitStates(DPP, WaitStates);
+        return;
+      }
+    }
+  }
+}
+
 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 // around other non-memory instructions.
 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@@ -549,6 +576,9 @@
         }
       }
 
+      if (TII->isDPP(*I))
+        insertDPPWaitStates(I);
+
       // Wait for everything before a barrier.
       if (I->getOpcode() == AMDGPU::S_BARRIER)
         Changes |= insertWait(MBB, I, LastIssued);
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -40,6 +40,7 @@
   field bits<1> FLAT = 0;
   field bits<1> WQM = 0;
   field bits<1> VGPRSpill = 0;
+  field bits<1> DPP = 0;
 
   // This bit tells the assembler to use the 32-bit encoding in case it
   // is unable to infer the encoding from the operands.
@@ -73,6 +74,7 @@
   let TSFlags{20} = WQM;
   let TSFlags{21} = VGPRSpill;
   let TSFlags{22} = VOPAsmPrefer32Bit;
+  let TSFlags{23} = DPP;
 
   let SchedRW = [Write32Bit];
 
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -301,6 +301,14 @@
     return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
   }
 
+  static bool isDPP(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::DPP;
+  }
+
+  bool isDPP(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::DPP;
+  }
+
   bool isInlineConstant(const APInt &Imm) const;
   bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
   bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
Index: lib/Target/AMDGPU/VIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/VIInstrFormats.td
+++ lib/Target/AMDGPU/VIInstrFormats.td
@@ -172,6 +172,7 @@
 class VOP_DPP <dag outs, dag ins, string asm, list<dag> pattern> :
     VOPAnyCommon <outs, ins, asm, pattern> {
   let Size = 8;
+  let DPP = 1;
 }
 
 class VOP_DPPe : Enc64 {
Index: test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -1,6 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
 
+; FIXME: The register allocator / scheduler should be able to avoid these hazards.
+
 ; VI-LABEL: {{^}}dpp_test:
+; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
+; VI: s_nop 1
 ; VI: v_mov_b32 v0, v0, 1, -1, 1, 1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
 define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i1 1, i32 1, i32 1) #0
@@ -8,6 +12,19 @@
   ret void
 }
 
+; VI-LABEL: {{^}}dpp_wait_states:
+; VI: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
+; VI: s_nop 1
+; VI: v_mov_b32 [[VGPR1:v[0-9]+]], [[VGPR0]], 1, -1, 1, 1
+; VI: s_nop 1
+; VI: v_mov_b32 v{{[0-9]+}}, [[VGPR1]], 1, -1, 1, 1
+define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i1 1, i32 1, i32 1) #0
+  %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i1 1, i32 1, i32 1) #0
+  store i32 %tmp1, i32 addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i1, i32, i32) #0
 
 attributes #0 = { nounwind readnone convergent }
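
Note for readers (not part of the patch): the hazard handled above is that a
DPP instruction may not read a VGPR until two wait states after the VALU
instruction that wrote it, and every instruction already sitting between the
write and the DPP read counts toward that budget. The helper below is a
minimal standalone sketch of that arithmetic; the function name and its
Distance parameter are illustrative only and do not exist in the LLVM tree.
Because "s_nop N" idles for N+1 cycles, the two required wait states show up
as "s_nop 1" in the CHECK lines above.

// Hypothetical mirror of the countdown loop in insertDPPWaitStates.
// Distance is how far back the defining VALU instruction is
// (1 = the immediately preceding instruction).
unsigned dppWaitStatesNeeded(unsigned Distance) {
  // Distance 1 -> 2 wait states, distance 2 -> 1, distance >= 3 -> none.
  return Distance >= 3 ? 0 : 3 - Distance;
}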