diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4006,10 +4006,29 @@ Src2.setReg(RegOp2); } - if (TRI->getRegSizeInBits(*MRI.getRegClass(Src2.getReg())) == 64) { - BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) - .addReg(Src2.getReg()) - .addImm(0); + const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg()); + if (TRI->getRegSizeInBits(*Src2RC) == 64) { + if (ST.hasScalarCompareEq64()) { + BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) + .addReg(Src2.getReg()) + .addImm(0); + } else { + const TargetRegisterClass *SubRC = + TRI->getSubRegClass(Src2RC, AMDGPU::sub0); + MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm( + MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC); + MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm( + MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC); + Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32) + .add(Src2Sub0) + .add(Src2Sub1); + + BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) + .addReg(Src2_32, RegState::Kill) + .addImm(0); + } } else { BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32)) .addReg(Src2.getReg()) diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -1,8 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define i32 @s_add_co_select_user() { +; GFX7-LABEL: s_add_co_select_user: +; GFX7: ; %bb.0: ; %bb +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e64 v0, s[4:5], s6, s6 +; GFX7-NEXT: s_or_b32 s4, s4, s5 +; GFX7-NEXT: s_cmp_lg_u32 s4, 0 +; GFX7-NEXT: s_addc_u32 s4, s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_cselect_b64 vcc, 1, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_cmp_gt_u32_e64 vcc, s6, 31 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: s_add_co_select_user: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -52,6 +70,32 @@ } define amdgpu_kernel void @s_add_co_br_user(i32 %i) { +; GFX7-LABEL: s_add_co_br_user: +; GFX7: ; %bb.0: ; %bb +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s1, s0, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_cmp_lt_u32_e32 vcc, s1, v0 +; GFX7-NEXT: s_or_b32 s1, vcc_lo, vcc_hi +; GFX7-NEXT: s_cmp_lg_u32 s1, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7-NEXT: s_addc_u32 s0, s0, 0 +; GFX7-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0 +; GFX7-NEXT: s_and_b64 vcc, exec, vcc +; GFX7-NEXT: s_cbranch_vccnz BB1_2 +; GFX7-NEXT: ; %bb.1: ; %bb0 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 9 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: BB1_2: ; %bb1 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 10 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0