diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1590,6 +1590,7 @@
 }
 
 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
   switch (MI.getOpcode()) {
@@ -1675,8 +1676,8 @@
   case AMDGPU::V_SET_INACTIVE_B32: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-      .addReg(Exec);
+    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
+    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
       .add(MI.getOperand(2));
     BuildMI(MBB, MI, DL, get(NotOpc), Exec)
@@ -1687,8 +1688,8 @@
   case AMDGPU::V_SET_INACTIVE_B64: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-      .addReg(Exec);
+    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
+    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
     MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
                                  MI.getOperand(0).getReg())
       .add(MI.getOperand(2));
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -145,6 +145,7 @@
 
 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 // restoring it after we're done.
+let Defs = [SCC] in {
 def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
   (ins VGPR_32: $src, VSrc_b32:$inactive),
   [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
@@ -156,6 +157,7 @@
   [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
   let Constraints = "$src = $vdst";
 }
+} // End Defs = [SCC]
 
 let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
 def V_ADD_U64_PSEUDO : VPseudoInstSI <
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -1 +1,103 @@
-; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %S/../llvm.amdgcn.set.inactive.ll | FileCheck -check-prefix=GCN %S/../llvm.amdgcn.set.inactive.ll
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_kernel void @set_inactive(i32 addrspace(1)* %out, i32 %in) {
+; GCN-LABEL: set_inactive:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 42
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+  store i32 %tmp, i32 addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_64(i64 addrspace(1)* %out, i64 %in) {
+; GCN-LABEL: set_inactive_64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    s_endpgm
+  %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+  store i64 %tmp, i64 addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4 x i32> inreg %desc) {
+; GCN-LABEL: set_inactive_scc:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_buffer_load_dword s2, s[4:7], 0x0
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s2, 56
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, 42
+; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    s_and_b32 s0, s0, 1
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cbranch_scc0 BB2_2
+; GCN-NEXT:  ; %bb.1: ; %.one
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; GCN-NEXT:    s_branch BB2_3
+; GCN-NEXT:  BB2_2:
+; GCN-NEXT:    s_mov_b32 s0, -1
+; GCN-NEXT:  BB2_3: ; %Flow
+; GCN-NEXT:    s_xor_b32 s0, s0, -1
+; GCN-NEXT:    s_and_b32 s0, s0, 1
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cbranch_scc1 BB2_5
+; GCN-NEXT:  ; %bb.4: ; %.zero
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:  BB2_5: ; %.exit
+; GCN-NEXT:    s_endpgm
+  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
+  %cmp = icmp eq i32 %val, 56
+  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+  br i1 %cmp, label %.zero, label %.one
+
+.zero:
+  store i32 %tmp, i32 addrspace(1)* %out
+  br label %.exit
+
+.one:
+  %tmp.1 = add i32 %tmp, 1
+  store i32 %tmp.1, i32 addrspace(1)* %out
+  br label %.exit
+
+.exit:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
+declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
+
+attributes #0 = { convergent readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -23,7 +23,31 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}set_inactive_scc:
+; GCN: s_cmp
+; GCN-NOT: s_not
+; GCN: s_cbranch_scc
+define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4 x i32> inreg %desc) {
+  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
+  %cmp = icmp eq i32 %val, 56
+  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+  br i1 %cmp, label %.zero, label %.one
+
+.zero:
+  store i32 %tmp, i32 addrspace(1)* %out
+  br label %.exit
+
+.one:
+  %tmp.1 = add i32 %tmp, 1
+  store i32 %tmp.1, i32 addrspace(1)* %out
+  br label %.exit
+
+.exit:
+  ret void
+}
+
 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
 
 attributes #0 = { convergent readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -99,7 +99,7 @@
     %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, 0, 0, 0, implicit $exec
     %16:vgpr_32 = COPY %8.sub1
     %11:vgpr_32 = COPY %16
-    %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec
+    %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec, implicit-def $scc
     %14:vgpr_32 = COPY %7
     %13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec
     early-clobber %15:vgpr_32 = WWM killed %13, implicit $exec