diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -571,7 +571,7 @@ MachineOperand &Inactive = MI.getOperand(2); if (Inactive.isReg()) { if (Inactive.isUndef()) { - LowerToCopyInstrs.push_back(&MI); + LowerToMovInstrs.push_back(&MI); } else { markOperand(MI, Inactive, StateStrictWWM, Worklist); } @@ -1492,7 +1492,18 @@ void SIWholeQuadMode::lowerCopyInstrs() { for (MachineInstr *MI : LowerToMovInstrs) { - assert(MI->getNumExplicitOperands() == 2); + if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || + MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { + assert(MI->getNumExplicitOperands() == 3); + // the only reason we should be here is V_SET_INACTIVE has + // an undef input so it is being replaced by a simple copy. + // There should be a second undef source that we should remove. + assert(MI->getOperand(2).isUndef()); + MI->removeOperand(2); + MI->untieRegOperand(1); + } else { + assert(MI->getNumExplicitOperands() == 2); + } const Register Reg = MI->getOperand(0).getReg(); @@ -1525,22 +1536,8 @@ LLVM_DEBUG(dbgs() << " -> " << *MI); } } - for (MachineInstr *MI : LowerToCopyInstrs) { - if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || - MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { - assert(MI->getNumExplicitOperands() == 3); - // the only reason we should be here is V_SET_INACTIVE has - // an undef input so it is being replaced by a simple copy. - // There should be a second undef source that we should remove. - assert(MI->getOperand(2).isUndef()); - MI->removeOperand(2); - MI->untieRegOperand(1); - } else { - assert(MI->getNumExplicitOperands() == 2); - } - + for (MachineInstr *MI : LowerToCopyInstrs) MI->setDesc(TII->get(AMDGPU::COPY)); - } } void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -21,6 +21,21 @@ ret void } +define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { +; GCN-LABEL: set_inactive_imm_poison: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm + %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 + store i32 %tmp, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: @@ -43,6 +58,22 @@ ret void } +define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { +; GCN-LABEL: set_inactive_imm_poison_64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm + %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 + store i64 %tmp, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: @@ -58,20 +89,20 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s3, 56 ; GCN-NEXT: s_mov_b64 s[2:3], -1 -; GCN-NEXT: s_cbranch_scc1 .LBB2_3 +; GCN-NEXT: s_cbranch_scc1 .LBB4_3 ; GCN-NEXT: ; %bb.1: ; %Flow ; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN-NEXT: s_cbranch_vccz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %.exit +; GCN-NEXT: s_cbranch_vccz .LBB4_4 +; GCN-NEXT: .LBB4_2: ; %.exit ; GCN-NEXT: s_endpgm -; GCN-NEXT: .LBB2_3: ; %.one +; GCN-NEXT: .LBB4_3: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_mov_b64 s[2:3], 0 -; GCN-NEXT: s_cbranch_execnz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %.zero +; GCN-NEXT: s_cbranch_execnz .LBB4_2 +; GCN-NEXT: .LBB4_4: ; %.zero ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1171,12 +1171,12 @@ ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-W64-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen -; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec ; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX9-W64-NEXT: s_waitcnt vmcnt(1) +; GFX9-W64-NEXT: v_mov_b32_e32 v1, v1 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1 @@ -1192,8 +1192,9 @@ ; GFX10-W32-NEXT: s_clause 0x1 ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec ; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec +; GFX10-W32-NEXT: s_waitcnt vmcnt(1) +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v2 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2