Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1848,6 +1848,40 @@ if (OpToFold.isReg() && !OpToFold.getReg().isVirtual()) continue; + if (OpToFold.isReg()) { + // Fold a VGPR-to-VGPR copy through an intermediate readfirstlane. + // + // %0:vgpr_32 = COPY $vgpr0 + // %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec + // %2:vgpr_32 = COPY %1 + // + // => %2 = COPY %0 + // NOTE(review): only equivalent if %0 is wave-uniform -- confirm. + MachineInstr *SrcDef = MRI->getVRegDef(OpToFold.getReg()); + if (SrcDef && SrcDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32 && + MRI->hasOneUse(OpToFold.getReg()) && + !TRI->isSGPRReg(*MRI, MI.getOperand(0).getReg())) { + // TODO: Should also fold through reg_sequence + if (!execMayBeModifiedBeforeUse(*MRI, OpToFold.getReg(), *SrcDef, + MI)) { + OpToFold.setReg(SrcDef->getOperand(1).getReg()); + OpToFold.setSubReg(SrcDef->getOperand(1).getSubReg()); + SrcDef->eraseFromParent(); + MRI->clearKillFlags(OpToFold.getReg()); + + // FIXME: Do we need to make this a convergent move? + // If this was an ordinary copy, we need to track the exec + // dependency. + if (MI.isCopy()) + MI.addOperand( + MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + + Changed = true; + continue; + } + } + } + // Prevent folding operands backwards in the function. 
For example, // the COPY opcode must not be replaced by 1 in this example: // Index: llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -87,8 +87,7 @@ ; GCN-NEXT: .LBB1_2: ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v1, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -181,8 +180,7 @@ ; GCN-NEXT: .LBB3_2: ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v1 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, v1, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -279,10 +277,9 @@ ; GCN-NEXT: buffer_atomic_xor v1, v2, s[0:3], 0 idxen glc ; GCN-NEXT: .LBB5_2: ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_xor_b32_e32 v4, s4, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v4, v1, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -33,9 +33,8 @@ ; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mad_u32_u24 
v0, v0, 5, s0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -61,8 +60,7 @@ ; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -89,8 +87,7 @@ ; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -38,8 +38,7 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -68,11 +67,10 @@ ; GFX89-NEXT: buffer_wbinvl1_vol ; GFX89-NEXT: .LBB0_2: ; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX89-NEXT: v_readfirstlane_b32 s4, v1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s3, 0xf000 ; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; @@ -627,13 +625,11 @@ ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 
s4, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v3, 5, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -664,10 +660,6 @@ ; GFX89-NEXT: .LBB3_2: ; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_readfirstlane_b32 s2, v0 -; GFX89-NEXT: v_readfirstlane_b32 s3, v1 -; GFX89-NEXT: v_mov_b32_e32 v0, s2 -; GFX89-NEXT: v_mov_b32_e32 v1, s3 ; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX89-NEXT: s_mov_b32 s3, 0xf000 ; GFX89-NEXT: s_mov_b32 s2, -1 @@ -790,15 +782,13 @@ ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v3, s0, v2 ; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 +; GFX7LESS-NEXT: v_add_i32_e32 v3, vcc, v3, v0 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -834,13 +824,11 @@ ; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; 
GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -878,13 +866,11 @@ ; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -1712,13 +1698,11 @@ ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v3, 5, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -1749,15 +1733,13 @@ ; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v3, 5, v2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 
s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1788,15 +1770,13 @@ ; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mul_hi_u32_u24_e32 v3, 5, v2 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1921,15 +1901,13 @@ ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v3, s0, v2 ; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 +; GFX7LESS-NEXT: v_add_i32_e32 v3, vcc, v3, v0 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v2 -; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -1965,13 +1943,11 @@ ; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; 
GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -2009,13 +1985,11 @@ ; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -37,9 +37,8 @@ ; GFX7LESS-NEXT: .LBB0_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 +; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm @@ -66,11 +65,9 @@ ; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v1 -; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; 
GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -95,11 +92,9 @@ ; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -753,13 +748,11 @@ ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v3, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm @@ -786,10 +779,6 @@ ; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_readfirstlane_b32 s3, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -818,10 +807,6 @@ ; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 
@@ -932,14 +917,12 @@ ; GFX7LESS-NEXT: s_mov_b32 s4, s0 ; GFX7LESS-NEXT: s_mov_b32 s5, s1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v3, s2, v2 ; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 +; GFX7LESS-NEXT: v_add_i32_e32 v3, vcc, v3, v0 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -968,18 +951,16 @@ ; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 +; GFX8-NEXT: s_mov_b32 s4, s0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -1012,15 +993,13 @@ ; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 ; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -1918,13 +1897,11 @@ ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v3, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm @@ -1952,12 +1929,10 @@ ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_readfirstlane_b32 s3, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v3, 5, v2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1985,12 +1960,10 @@ ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mul_hi_u32_u24_e32 v3, 5, v2 ; GFX9-NEXT: v_sub_co_u32_e32 v0, 
vcc, s2, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2105,14 +2078,12 @@ ; GFX7LESS-NEXT: s_mov_b32 s4, s0 ; GFX7LESS-NEXT: s_mov_b32 s5, s1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v3, s2, v2 ; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 +; GFX7LESS-NEXT: v_add_i32_e32 v3, vcc, v3, v0 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 -; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -2141,18 +2112,16 @@ ; GFX8-NEXT: .LBB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 +; GFX8-NEXT: s_mov_b32 s4, s0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -2185,15 +2154,13 @@ ; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 ; GFX9-NEXT: s_mov_b32 s4, s0 -; 
GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -34,8 +34,7 @@ ; GFX7-NEXT: .LBB0_3: ; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX7-NEXT: .LBB0_4: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_wqm_b64 s[4:5], -1 @@ -68,8 +67,7 @@ ; GFX89-NEXT: .LBB0_3: ; GFX89-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_readfirstlane_b32 s4, v1 -; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX89-NEXT: .LBB0_4: ; %Flow ; GFX89-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX89-NEXT: s_wqm_b64 s[4:5], -1 Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -32,9 +32,8 @@ ; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX6-NEXT: s_mov_b32 
s7, 0xf000 -; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -60,8 +59,7 @@ ; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -88,8 +86,7 @@ ; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -33,9 +33,8 @@ ; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -62,8 +61,7 @@ ; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -91,8 +89,7 @@ ; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] Index: llvm/test/CodeGen/AMDGPU/fold-readlane.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/fold-readlane.mir +++ llvm/test/CodeGen/AMDGPU/fold-readlane.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -run-pass si-fold-operands -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass si-fold-operands -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: fold-imm-readfirstlane{{$}} # GCN: %1:sreg_32_xm0 = S_MOV_B32 123 @@ -388,3 +388,196 @@ %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0:vreg_64, implicit $exec %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1:vreg_64, implicit $exec ... + +# GCN-LABEL: name: fold_readfirstlane_into_copy_to_vgpr_virtreg{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %2:vgpr_32 = COPY %0, implicit $exec +# GCN-NEXT: S_NOP 0, implicit %2 +--- +name: fold_readfirstlane_into_copy_to_vgpr_virtreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec + %2:vgpr_32 = COPY %1 + S_NOP 0, implicit %2 +... + +# GCN-LABEL: name: fold_readfirstlane_into_copy_to_vgpr_virtreg_kill{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: S_NOP 0, implicit %0 +# GCN-NEXT: %2:vgpr_32 = COPY %0, implicit $exec +# GCN-NEXT: S_NOP 0, implicit %2 +--- +name: fold_readfirstlane_into_copy_to_vgpr_virtreg_kill +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec + S_NOP 0, implicit killed %0 + %2:vgpr_32 = COPY killed %1 + S_NOP 0, implicit %2 +... + +# Make sure we don't delete def for the other user. 
+# GCN-LABEL: name: fold_readfirstlane_into_copy_to_vgpr_virtreg_multi_use{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec +# GCN-NEXT: S_NOP 0, implicit %1 +# GCN-NEXT: %2:vgpr_32 = COPY killed %1 +# GCN-NEXT: S_NOP 0, implicit %2 +--- +name: fold_readfirstlane_into_copy_to_vgpr_virtreg_multi_use +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec + S_NOP 0, implicit %1 + %2:vgpr_32 = COPY killed %1 + S_NOP 0, implicit %2 +... + +# GCN-LABEL: name: copy_undef_virtreg_sgpr{{$}} +# GCN: %0:vgpr_32 = COPY undef %1:sgpr_32 +--- +name: copy_undef_virtreg_sgpr +tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = COPY undef %1:sgpr_32 + S_NOP 0, implicit %0 +... + +# GCN-LABEL: name: fold_readfirstlane_physreg_into_copy_to_vgpr_virtreg{{$}} +# GCN: %1:vgpr_32 = COPY $vgpr0, implicit $exec +# GCN-NEXT: S_NOP 0, implicit %1 +--- +name: fold_readfirstlane_physreg_into_copy_to_vgpr_virtreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec + %2:vgpr_32 = COPY %1 + S_NOP 0, implicit %2 +... + +# GCN-LABEL: name: fold_readfirstlane_into_copy_to_vgpr_physreg{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: $vgpr0 = COPY %0, implicit $exec +# GCN-NEXT: S_NOP 0, implicit $vgpr0 +--- +name: fold_readfirstlane_into_copy_to_vgpr_physreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec + $vgpr0 = COPY %1 + S_NOP 0, implicit $vgpr0 +... 
+ +# GCN-LABEL: name: fold_readfirstlane_into_copy_to_vgpr_virtreg_execdef{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec +# GCN-NEXT: S_NOP 0, implicit-def $exec +# GCN-NEXT: %2:vgpr_32 = COPY %1 +--- +name: fold_readfirstlane_into_copy_to_vgpr_virtreg_execdef +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec + S_NOP 0, implicit-def $exec + %2:vgpr_32 = COPY %1 + S_NOP 0, implicit %2 +... + +# GCN-LABEL: name: no_fold_readfirstlane_into_copy_to_sgpr_virtreg{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec +# GCN-NEXT: %2:sgpr_32 = COPY %1 +--- +name: no_fold_readfirstlane_into_copy_to_sgpr_virtreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec + %2:sgpr_32 = COPY %1 + S_NOP 0, implicit %2 +... + +# GCN-LABEL: name: fold_readfirstlane_into_copy_to_agpr_virtreg{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %2:agpr_32 = COPY %0, implicit $exec +# GCN-NEXT: S_NOP 0, implicit %2 +--- +name: fold_readfirstlane_into_copy_to_agpr_virtreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec + %2:agpr_32 = COPY %1 + S_NOP 0, implicit %2 +... 
+ +# TODO: Should be able to handle this +# GCN-LABEL: name: fold_readfirstlane_into_copy_to_vgpr_virtreg_reg_sequence{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:vgpr_32 = COPY $vgpr1 +# GCN-NEXT: %2:sreg_32_xm0 = V_READFIRSTLANE_B32 killed %0, implicit $exec +# GCN-NEXT: %3:sreg_32_xm0 = V_READFIRSTLANE_B32 killed %1, implicit $exec +# GCN-NEXT: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 +# GCN-NEXT: %5:vreg_64 = COPY %4 +# GCN-NEXT: S_NOP 0, implicit %5 + +--- +name: fold_readfirstlane_into_copy_to_vgpr_virtreg_reg_sequence +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:sreg_32_xm0 = V_READFIRSTLANE_B32 killed %0, implicit $exec + %3:sreg_32_xm0 = V_READFIRSTLANE_B32 killed %1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %5:vreg_64 = COPY %4 + S_NOP 0, implicit %5 +... + +# GCN-LABEL: name: fold_readfirstlane_into_copy_to_vgpr_virtreg_reg_sequence_extract{{$}} +# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1 +# GCN-NEXT: %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %0.sub0, implicit $exec +# GCN-NEXT: %4:sreg_32_xm0 = V_READFIRSTLANE_B32 %0.sub1, implicit $exec +# GCN-NEXT: %5:sreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 +# GCN-NEXT: %6:vreg_64 = COPY %5 +# GCN-NEXT: S_NOP 0, implicit %6 +--- +name: fold_readfirstlane_into_copy_to_vgpr_virtreg_reg_sequence_extract +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY %0.sub0 + %2:vgpr_32 = COPY %0.sub1 + %3:sreg_32_xm0 = V_READFIRSTLANE_B32 killed %1, implicit $exec + %4:sreg_32_xm0 = V_READFIRSTLANE_B32 killed %2, implicit $exec + %5:sreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vreg_64 = COPY %5 + S_NOP 0, implicit %6 +... 
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -5,6 +5,19 @@ ; CHECK-LABEL: {{^}}test_readfirstlane: ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) + call void asm sideeffect "; use $0","s"(i32 %readfirstlane) + ret void +} + +; The readfirstlane is copied right back to a VGPR, so this is +; eliminated. +; CHECK-LABEL: {{^}}test_readfirstlane_copyback_v: +; CHECK: s_waitcnt +; CHECK-NEXT: flat_store_dword +; CHECK-NEXT: s_waitcnt +; CHECK-NEXT: s_setpc_b64 +define void @test_readfirstlane_copyback_v(i32 addrspace(1)* %out, i32 %src) #1 { %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void Index: llvm/test/CodeGen/AMDGPU/wave-id-computation.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/wave-id-computation.ll +++ llvm/test/CodeGen/AMDGPU/wave-id-computation.ll @@ -100,8 +100,7 @@ ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 26, v1 ; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 6, v0 -; CHECK-NEXT: v_readfirstlane_b32 s4, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: %i = tail call i32 @llvm.amdgcn.workgroup.id.x()