diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -601,9 +601,22 @@ SCCCopy) .addImm(-1) .addImm(0); - BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(), - TII->get(AMDGPU::COPY), DstReg) - .addReg(SCCCopy); + I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(), + TII->get(AMDGPU::COPY), DstReg) + .addReg(SCCCopy); + MI.eraseFromParent(); + continue; + } else if (DstReg == AMDGPU::SCC) { + unsigned Opcode = + ST.isWave64() ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; + Register Exec = ST.isWave64() ? AMDGPU::EXEC : AMDGPU::EXEC_LO; + Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC()); + I = BuildMI(*MI.getParent(), + std::next(MachineBasicBlock::iterator(MI)), + MI.getDebugLoc(), TII->get(Opcode)) + .addReg(Tmp, getDefRegState(true)) + .addReg(SrcReg) + .addReg(Exec); MI.eraseFromParent(); continue; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -807,7 +807,7 @@ ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX9-NEXT: global_store_short v1, v0, s[2:3] @@ -832,7 +832,7 @@ ; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX90A-NEXT: global_store_short v1, v0, s[2:3] @@ -916,7 +916,7 @@ ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cselect_b32 s2, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 @@ -945,7 +945,7 @@ ; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 ; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s5 @@ -1202,7 +1202,7 @@ ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX9-NEXT: global_store_byte v1, v0, s[2:3] @@ -1227,7 +1227,7 @@ ; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX90A-NEXT: global_store_byte v1, v0, s[2:3] @@ -1312,7 +1312,7 @@ ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cselect_b32 s2, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 @@ -1343,7 +1343,7 @@ ; GFX90A-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v1| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 ; GFX90A-NEXT: v_add_u32_e32 v1, s0, v3 ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 @@ -3915,7 +3915,7 @@ ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s8, 0 ; GFX9-NEXT: s_ashr_i32 s1, s6, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 @@ -3931,8 +3931,8 @@ ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 ; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_sext_i32_i16 s1, s7 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 @@ -3947,7 +3947,7 @@ ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: s_ashr_i32 s1, s7, 16 @@ -3964,7 +3964,7 @@ ; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 @@ -3993,7 +3993,7 @@ ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 ; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 ; GFX90A-NEXT: s_ashr_i32 s1, s6, 16 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 @@ -4009,8 +4009,8 @@ ; GFX90A-NEXT: v_mad_f32 v1, -v4, v0, v1 ; GFX90A-NEXT: s_or_b32 s4, s0, 1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: s_sext_i32_i16 s1, s7 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 @@ -4025,7 +4025,7 @@ ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_mad_f32 v1, -v5, v0, v1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: s_ashr_i32 s1, s7, 16 @@ -4042,7 +4042,7 @@ ; GFX90A-NEXT: v_mad_f32 v5, -v6, v0, v5 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX90A-NEXT: v_add_u32_e32 v0, s0, v6 @@ -4273,7 +4273,7 @@ ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s8, 0 ; GFX9-NEXT: s_ashr_i32 s9, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 @@ -4291,7 +4291,7 @@ ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: s_or_b32 s6, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 ; GFX9-NEXT: s_sext_i32_i16 s0, s7 @@ -4308,7 +4308,7 @@ ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s6, 0 ; GFX9-NEXT: s_ashr_i32 s6, s7, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 @@ -4325,7 +4325,7 @@ ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s9, 0 ; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 @@ -4360,7 +4360,7 @@ ; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 ; GFX90A-NEXT: s_ashr_i32 s8, s6, 16 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s8 @@ -4378,7 +4378,7 @@ ; GFX90A-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v1| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 ; GFX90A-NEXT: v_add_u32_e32 v1, s0, v4 ; GFX90A-NEXT: s_sext_i32_i16 s0, s7 @@ -4396,7 +4396,7 @@ ; GFX90A-NEXT: v_mad_f32 v1, -v5, v3, v1 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v3| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: s_ashr_i32 s4, s7, 16 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 @@ -4414,7 +4414,7 @@ ; GFX90A-NEXT: v_mad_f32 v5, -v6, v3, v5 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 ; GFX90A-NEXT: v_add_u32_e32 v3, s0, v6 ; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4 @@ -4695,7 +4695,7 @@ ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 @@ -4721,7 +4721,7 @@ ; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX90A-NEXT: v_and_b32_e32 v0, 7, v0 @@ -4808,7 +4808,7 @@ ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cselect_b32 s2, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 @@ -4840,7 +4840,7 @@ ; GFX90A-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v1| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 ; GFX90A-NEXT: v_add_u32_e32 v1, s0, v3 ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 @@ -5464,7 +5464,7 @@ ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s8, 0 ; GFX9-NEXT: s_ashr_i32 s1, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 @@ -5480,8 +5480,8 @@ ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_sext_i32_i16 s1, s7 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 @@ -5497,7 +5497,7 @@ ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -5525,7 +5525,7 @@ ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 ; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 ; GFX90A-NEXT: s_ashr_i32 s1, s6, 16 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 @@ -5541,8 +5541,8 @@ ; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 ; GFX90A-NEXT: v_mad_f32 v3, -v4, v0, v3 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: s_sext_i32_i16 s1, s7 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 @@ -5558,7 +5558,7 @@ ; GFX90A-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_add_u32_e32 v0, s0, v5 ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -5745,7 +5745,7 @@ ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s10, 0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -5763,7 +5763,7 @@ ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: s_or_b32 s8, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 ; GFX9-NEXT: s_cselect_b32 s0, s8, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 @@ -5779,7 +5779,7 @@ ; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s7, 0 ; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 @@ -5812,7 +5812,7 @@ ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 ; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s10, 0 ; GFX90A-NEXT: s_ashr_i32 s6, s6, 16 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 @@ -5830,7 +5830,7 @@ ; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX90A-NEXT: s_or_b32 s8, s0, 1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 ; GFX90A-NEXT: v_add_u32_e32 v2, s0, v4 ; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s6 @@ -5848,7 +5848,7 @@ ; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 ; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 ; GFX90A-NEXT: v_sub_u32_e32 v0, s9, v0 @@ -6542,7 +6542,7 @@ ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: s_cselect_b32 s0, s5, 0 ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f @@ -6563,7 +6563,7 @@ ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 ; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 @@ -6612,7 +6612,7 @@ ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 ; GFX90A-NEXT: s_bfe_i32 s1, s6, 0xf000f @@ -6633,7 +6633,7 @@ ; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, v1 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_bfe_i32 v0, v0, 0, 15 ; GFX90A-NEXT: v_add_u32_e32 v5, s0, v6 @@ -6864,7 +6864,7 @@ ; GFX9-NEXT: s_bfe_u32 s10, s6, 0xf000f ; GFX9-NEXT: s_or_b32 s11, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s11, 0 ; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 ; GFX9-NEXT: s_bfe_i32 s0, s10, 0xf0000 @@ -6881,7 +6881,7 @@ ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s5, 0 ; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 15 ; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 @@ -6949,7 +6949,7 @@ ; GFX90A-NEXT: s_bfe_u32 s10, s6, 0xf000f ; GFX90A-NEXT: s_or_b32 s11, s0, 1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s11, 0 ; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 ; GFX90A-NEXT: s_bfe_i32 s0, s10, 0xf0000 @@ -6968,7 +6968,7 @@ ; GFX90A-NEXT: s_or_b32 s4, s0, 1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| ; GFX90A-NEXT: v_and_b32_e32 v1, s8, v1 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_bfe_i32 v5, v1, 0, 15 ; GFX90A-NEXT: v_add_u32_e32 v4, s0, v6 diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll new file mode 100755 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define amdgpu_kernel void @copy_to_scc(i32 addrspace(1)* %out, i32 addrspace(1)* %in, <4 x i32> addrspace(4)* %addrSrc) { +; GCN-LABEL: copy_to_scc: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_load_dword s10, s[6:7], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:252 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_xor_b64 s[0:1], s[6:7], vcc +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, 2, 3 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: global_store_dword v0, v1, s[4:5] +; GCN-NEXT: s_endpgm +entry: ; preds = %1009 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %1 = load <4 x i32>, <4 x i32> addrspace(4)* %addrSrc, align 16 + %2 = icmp ne i32 %0, 0 + %3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %1, i32 252, i32 0, i32 0) + %4 = icmp ne i32 %3, 0 + %5 = xor i1 %2, %4 + %result = select i1 %5, i32 2, i32 3 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)