Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -735,6 +735,12 @@
             [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
             [IntrNoMem, IntrConvergent]>;
 
+// llvm.amdgcn.mov.dpp.wqm.i32 <src> <dpp ctrl> <row mask> <bank mask> <bound ctrl>
+def int_amdgcn_mov_dpp_wqm :
+  Intrinsic<[llvm_anyint_ty],
+            [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+             llvm_i1_ty], [IntrNoMem, IntrConvergent]>;
+
 def int_amdgcn_s_dcache_wb : GCCBuiltin<"__builtin_amdgcn_s_dcache_wb">,
   Intrinsic<[], [], []>;
 
Index: lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP1Instructions.td
+++ lib/Target/AMDGPU/VOP1Instructions.td
@@ -642,6 +642,12 @@
   let SubtargetPredicate = isVI;
 }
 
+def V_MOV_B32_dpp_wqm : VOP1_DPP<0x1, !cast<VOP1_Pseudo>("V_MOV_B32_e32")> {
+  let WQM = 1;
+  let isCodeGenOnly = 1;
+}
+
+
 // This is a pseudo variant of the v_movreld_b32 instruction in which the
 // vector operand appears only twice, once as def and once as use. Using this
 // pseudo avoids problems with the Two Address instructions pass.
@@ -671,6 +677,13 @@
                    (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
 >;
 
+def : Pat <
+  (i32 (int_amdgcn_mov_dpp_wqm i32:$src, imm:$dpp_ctrl, imm:$row_mask,
+                               imm:$bank_mask, imm:$bound_ctrl)),
+  (V_MOV_B32_dpp_wqm $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
+                     (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
+>;
+
 def : Pat<
   (i32 (anyext i16:$src)),
 
Index: test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.wqm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.wqm.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
+
+; FIXME: The register allocator / scheduler should be able to avoid these hazards.
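+; (A DPP mov cannot read a VGPR for two wait states after the VALU write that
+; defined it; the compiler currently inserts "s_nop 1" to cover the hazard.)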
+
+; VI-LABEL: {{^}}dpp_test:
+; VI: s_wqm_b64 exec, exec
+; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; VI: exp mrt0 v0, v0, v0, v0 done vm
+define amdgpu_ps void @dpp_test(i32 %in) {
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+  %data = bitcast i32 %tmp0 to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1
+  ret void
+}
+
+; VI-LABEL: {{^}}dpp_wait_states:
+; VI: s_wqm_b64 exec, exec
+; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: exp mrt0 [[VGPR1]], v0, v0, v0 done vm
+define amdgpu_ps void @dpp_wait_states(i32 %in) {
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp1 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+  %data = bitcast i32 %tmp1 to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1
+  ret void
+}
+
+; VI-LABEL: {{^}}dpp_first_in_bb:
+; VI: s_wqm_b64 exec, exec
+; VI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec, s{{\[[0-9]+:[0-9]+\]}}
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x02,0x01,0x08,0x11]
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; VI: exp mrt0 v{{[0-9]+}}, v0, v0, v0 done vm
+define amdgpu_ps void @dpp_first_in_bb(i32 %idx, i32 %idx2, float %cond, float %a, float %b) {
+  %cmp = fcmp oeq float %cond, 0.0
+  br i1 %cmp, label %if, label %else
+
+if:
+  %out_val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+  %if_val = fadd float %a, %out_val
+  br label %endif
+
+else:
+  %in_val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx2, i32 0, i1 0, i1 0)
+  %else_val = fadd float %b, %in_val
+  br label %endif
+
+endif:
+  %val = phi float [%if_val, %if], [%else_val, %else]
+  %val_i32 = bitcast float %val to i32
+  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %val_i32, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp1 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp2 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %tmp1, i32 1, i32 1, i32 1, i1 1) #0
+  %tmp_float = bitcast i32 %tmp2 to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp_float, float undef, float undef, float undef, i1 true, i1 true) #1
+  ret void
+}
+
+; VI-LABEL: {{^}}llpc.dpdx.f32:
+; VI: s_wqm_b64 exec, exec
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 ; encoding: [0xfa,0x02,0x02,0x7e,0x00,0x00,0x08,0xff]
+; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x55,0x08,0xff]
+define amdgpu_ps float @llpc.dpdx.f32(float %p) {
+  ; Broadcast channel 0 to the whole quad
+  %p.i32 = bitcast float %p to i32
+  %p.dpp.i32 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %p.i32, i32 0, i32 15, i32 15, i1 1)
+  %p.dpp = bitcast i32 %p.dpp.i32 to float
+  ; Calculate the delta value
+  %dp.x = fsub float %p, %p.dpp
+  %dp = fmul float %dp.x, 0.5 ; scale the delta (not part of the original LLPC sequence)
+  ; Broadcast channel 1 to the whole quad (85 = 0x55 = quad_perm:[1,1,1,1])
+  %dp.i32 = bitcast float %dp to i32
+  %dpdx.i32 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %dp.i32, i32 85, i32 15, i32 15, i1 1)
+  %dpdx = bitcast i32 %dpdx.i32 to float
+  ret float %dpdx
+}
+
+declare i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32, i32, i32, i32, i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
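
For comparison, here is a minimal sketch (not part of the patch) of the matching Y-direction derivative built on the same intrinsic. The function name @llpc.dpdy.f32 is hypothetical; the sketch assumes the usual quad layout (lanes 0 and 1 on the top row, lanes 2 and 3 below), so broadcasting channel 2 takes quad_perm:[2,2,2,2], i.e. a dpp_ctrl of 170 (0xAA, two selector bits per lane, just as 85 = 0x55 encodes quad_perm:[1,1,1,1] in the test above).

; Hypothetical Y-derivative sketch using the intrinsic added by this patch.
declare i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32, i32, i32, i32, i1)

define amdgpu_ps float @llpc.dpdy.f32(float %p) {
  ; Broadcast channel 0 to the whole quad
  %p.i32 = bitcast float %p to i32
  %p0.i32 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %p.i32, i32 0, i32 15, i32 15, i1 1)
  %p0 = bitcast i32 %p0.i32 to float
  ; Per-lane delta against channel 0
  %dp = fsub float %p, %p0
  ; Broadcast channel 2 to the whole quad (170 = 0xAA = quad_perm:[2,2,2,2])
  %dp.i32 = bitcast float %dp to i32
  %dpdy.i32 = call i32 @llvm.amdgcn.mov.dpp.wqm.i32(i32 %dp.i32, i32 170, i32 15, i32 15, i1 1)
  %dpdy = bitcast i32 %dpdy.i32 to float
  ret float %dpdy
}

The WQM variant is the point here: the broadcast channel may be a helper lane that is inactive in the exact exec mask, and the WQM = 1 flag on V_MOV_B32_dpp_wqm tells the whole-quad-mode pass to keep such reads under s_wqm_b64, which the plain llvm.amdgcn.mov.dpp does not guarantee.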