diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -171,8 +171,6 @@ void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); char analyzeFunction(MachineFunction &MF); - bool requiresCorrectState(const MachineInstr &MI) const; - MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before); MachineBasicBlock::iterator @@ -526,36 +524,6 @@ return GlobalFlags; } -/// Whether \p MI really requires the exec state computed during analysis. -/// -/// Scalar instructions must occasionally be marked WQM for correct propagation -/// (e.g. thread masks leading up to branches), but when it comes to actual -/// execution, they don't care about EXEC. -bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const { - if (MI.isTerminator()) - return true; - - // Skip instructions that are not affected by EXEC - if (TII->isScalarUnit(MI)) - return false; - - // Generic instructions such as COPY will either disappear by register - // coalescing or be lowered to SALU or VALU instructions. - if (MI.isTransient()) { - if (MI.getNumExplicitOperands() >= 1) { - const MachineOperand &Op = MI.getOperand(0); - if (Op.isReg()) { - if (TRI->isSGPRReg(*MRI, Op.getReg())) { - // SGPR instructions are not affected by EXEC - return false; - } - } - } - } - - return true; -} - MachineBasicBlock::iterator SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before) { @@ -742,7 +710,7 @@ if (II != IE) { MachineInstr &MI = *II; - if (requiresCorrectState(MI)) { + if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) { auto III = Instructions.find(&MI); if (III != Instructions.end()) { if (III->second.Needs & StateWWM) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -375,9 +375,9 @@ ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -428,9 +428,9 @@ ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -480,9 +480,9 @@ ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -539,10 +539,10 @@ ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 @@ -614,9 +614,9 @@ ; ; GFX8-LABEL: add_i32_varying_gfx1032: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -667,9 +667,9 @@ ; ; GFX9-LABEL: add_i32_varying_gfx1032: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -719,9 +719,9 @@ ; ; GFX1064-LABEL: add_i32_varying_gfx1032: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -778,10 +778,10 @@ ; ; GFX1032-LABEL: add_i32_varying_gfx1032: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 @@ -853,9 +853,9 @@ ; ; GFX8-LABEL: add_i32_varying_gfx1064: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -906,9 +906,9 @@ ; ; GFX9-LABEL: add_i32_varying_gfx1064: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -958,9 +958,9 @@ ; ; GFX1064-LABEL: add_i32_varying_gfx1064: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -1017,10 +1017,10 @@ ; ; GFX1032-LABEL: add_i32_varying_gfx1064: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 @@ -1934,9 +1934,9 @@ ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -1987,9 +1987,9 @@ ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -2039,9 +2039,9 @@ ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -2098,10 +2098,10 @@ ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 @@ -2917,9 +2917,9 @@ ; ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -2970,9 +2970,9 @@ ; ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -3022,9 +3022,9 @@ ; ; GFX1064-LABEL: or_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -3081,10 +3081,10 @@ ; ; GFX1032-LABEL: or_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 @@ -3159,9 +3159,9 @@ ; ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -3212,9 +3212,9 @@ ; ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -3264,9 +3264,9 @@ ; ; GFX1064-LABEL: xor_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -3323,10 +3323,10 @@ ; ; GFX1032-LABEL: xor_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 @@ -4265,9 +4265,9 @@ ; ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -4318,9 +4318,9 @@ ; ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -4370,9 +4370,9 @@ ; ; GFX1064-LABEL: umax_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 @@ -4429,10 +4429,10 @@ ; ; GFX1032-LABEL: umax_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir --- a/llvm/test/CodeGen/AMDGPU/wqm.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm.mir @@ -107,3 +107,38 @@ S_ENDPGM 0 ... + +--- +# Ensure that wwm is not put around an EXEC copy +#CHECK-LABEL: name: copy_exec +#CHECK: %7:sreg_64 = COPY $exec +#CHECK-NEXT: %14:sreg_64 = ENTER_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec +#CHECK-NEXT: %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +#CHECK-NEXT: $exec = EXIT_WWM %14 +#CHECK-NEXT: %9:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %7.sub0, 0, implicit $exec +name: copy_exec +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3 + + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %4:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 + %5:sreg_32 = S_MOV_B32 0 + %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %4, %5, 0, 0, 0, 0, 0, 0, implicit $exec + + %8:sreg_64 = COPY $exec + %9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %10:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %8.sub0:sreg_64, 0, implicit $exec + %11:vgpr_32 = V_MOV_B32_dpp %9:vgpr_32, %10:vgpr_32, 312, 15, 15, 0, implicit $exec + %12:sreg_32 = V_READLANE_B32 %11:vgpr_32, 63 + early-clobber %13:sreg_32 = WWM %9:vgpr_32, implicit $exec + + %14:vgpr_32 = COPY %13 + BUFFER_STORE_DWORD_OFFSET_exact killed %14, %4, %5, 4, 0, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 + +...