Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -119,6 +119,18 @@
   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
 
+  /// \param DPP The DPP instruction.
+  /// \param SearchI The iterator at which to start looking for hazards.
+  /// \param SearchMBB The basic block we are operating on.
+  /// \param WaitStates The number of wait states that need to be inserted
+  /// when a hazard is detected.
+  void insertDPPWaitStates(MachineBasicBlock::iterator DPP,
+                           MachineBasicBlock::reverse_iterator SearchI,
+                           MachineBasicBlock *SearchMBB,
+                           unsigned WaitStates);
+
+  void insertDPPWaitStates(MachineBasicBlock::iterator DPP);
+
   /// Return true if there are LGKM instructions that haven't been waited on
   /// yet.
   bool hasOutstandingLGKM() const;
@@ -480,6 +492,45 @@
   }
 }
 
+void SIInsertWaits::insertDPPWaitStates(MachineBasicBlock::iterator DPP,
+                                        MachineBasicBlock::reverse_iterator SearchI,
+                                        MachineBasicBlock *SearchMBB,
+                                        unsigned WaitStates) {
+
+  MachineBasicBlock::reverse_iterator E = SearchMBB->rend();
+
+  for (; WaitStates > 0; --WaitStates, ++SearchI) {
+
+    // If we have reached the start of the block, we need to check predecessors.
+    if (SearchI == E) {
+      for (MachineBasicBlock *Pred : SearchMBB->predecessors()) {
+        // We only need to check fall-through blocks. Branch instructions
+        // give us enough wait states.
+        if (Pred->getFirstTerminator() == Pred->end()) {
+          insertDPPWaitStates(DPP, Pred->rbegin(), Pred, WaitStates);
+          break;
+        }
+      }
+      return;
+    }
+
+    for (MachineOperand &Op : SearchI->operands()) {
+      if (!Op.isReg() || !Op.isDef())
+        continue;
+
+      if (DPP->readsRegister(Op.getReg(), TRI)) {
+        TII->insertWaitStates(DPP, WaitStates);
+        return;
+      }
+    }
+  }
+}
+
+void SIInsertWaits::insertDPPWaitStates(MachineBasicBlock::iterator DPP) {
+  MachineBasicBlock::reverse_iterator I(DPP);
+  insertDPPWaitStates(DPP, I, DPP->getParent(), 2);
+}
+
 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 // around other non-memory instructions.
 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@@ -546,6 +597,10 @@
         }
       }
 
+      if (TII->isDPP(*I)) {
+        insertDPPWaitStates(I);
+      }
+
       // Wait for everything before a barrier.
       if (I->getOpcode() == AMDGPU::S_BARRIER)
         Changes |= insertWait(MBB, I, LastIssued);
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
@@ -301,6 +301,14 @@
     return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
   }
 
+  static bool isDPP(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::DPP;
+  }
+
+  bool isDPP(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::DPP;
+  }
+
   bool isInlineConstant(const APInt &Imm) const;
   bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
   bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -1,6 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
 
+; FIXME: The register allocator / scheduler should be able to avoid these hazards.
+
 ; VI-LABEL: {{^}}dpp_test:
+; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
+; VI: s_nop 1
 ; VI: v_mov_b32_dpp v0, v0 quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
 define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
@@ -8,6 +12,51 @@
   ret void
 }
 
+; VI-LABEL: {{^}}dpp_wait_states:
+; VI: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+  store i32 %tmp1, i32 addrspace(1)* %out
+  ret void
+}
+
+; VI-LABEL: {{^}}dpp_first_in_bb:
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:1 row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
+  %cmp = fcmp oeq float %cond, 0.0
+  br i1 %cmp, label %if, label %else
+
+if:
+  %out_val = load float, float addrspace(1)* %out
+  %if_val = fadd float %a, %out_val
+  br label %endif
+
+else:
+  %in_val = load float, float addrspace(1)* %in
+  %else_val = fadd float %b, %in_val
+  br label %endif
+
+endif:
+  %val = phi float [%if_val, %if], [%else_val, %else]
+  %val_i32 = bitcast float %val to i32
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %val_i32, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp2 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp1, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp_float = bitcast i32 %tmp2 to float
+  store float %tmp_float, float addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0
 
 attributes #0 = { nounwind readnone convergent }