diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2491,9 +2491,7 @@
     return;
   }
   case AMDGPU::G_CTPOP:
-  case AMDGPU::G_BITREVERSE:
-  case AMDGPU::G_CTLZ_ZERO_UNDEF:
-  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
+  case AMDGPU::G_BITREVERSE: {
     const RegisterBank *DstBank =
         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
     if (DstBank == &AMDGPU::SGPRRegBank)
@@ -2515,6 +2513,38 @@
     llvm_unreachable("narrowScalar should have succeeded");
     return;
   }
+  case AMDGPU::G_CTLZ_ZERO_UNDEF:
+  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
+    const RegisterBank *DstBank =
+        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+    if (DstBank == &AMDGPU::SGPRRegBank)
+      break;
+
+    Register SrcReg = MI.getOperand(1).getReg();
+    const LLT S32 = LLT::scalar(32);
+    LLT Ty = MRI.getType(SrcReg);
+    if (Ty == S32)
+      break;
+
+    // We can narrow this more efficiently than Helper can by using ffbh/ffbl
+    // which return -1 when the input is zero:
+    // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), 32 + (ffbh lo))
+    // (cttz_zero_undef hi:lo) -> (umin 32 + (ffbl hi), (ffbl lo))
+    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
+    MachineIRBuilder B(MI, ApplyVALU);
+    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
+    unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
+                          ? AMDGPU::G_AMDGPU_FFBH_U32
+                          : AMDGPU::G_AMDGPU_FFBL_B32;
+    unsigned Idx = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF;
+    auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
+    auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
+    Y = B.buildAdd(S32, Y, B.buildConstant(S32, 32));
+    Register DstReg = MI.getOperand(0).getReg();
+    B.buildUMin(DstReg, X, Y);
+    MI.eraseFromParent();
+    return;
+  }
   case AMDGPU::G_SEXT:
   case AMDGPU::G_ZEXT:
   case AMDGPU::G_ANYEXT: {
@@ -3729,7 +3759,13 @@
     break;
   }
   case AMDGPU::G_CTLZ_ZERO_UNDEF:
-  case AMDGPU::G_CTTZ_ZERO_UNDEF:
+  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
+    unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
+    OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
+    OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
+    break;
+  }
   case AMDGPU::G_CTPOP: {
     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir
@@ -60,14 +60,12 @@
     ; CHECK-LABEL: name: ctlz_zero_undef_s64_v
     ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
     ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
-    ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
-    ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV1]](s32), [[C]]
-    ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTLZ_ZERO_UNDEF [[UV]](s32)
-    ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32
-    ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[CTLZ_ZERO_UNDEF]], [[C1]]
-    ; CHECK: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:vgpr(s32) = G_CTLZ_ZERO_UNDEF [[UV1]](s32)
-    ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[CTLZ_ZERO_UNDEF1]]
-    ; CHECK: S_ENDPGM 0, implicit [[SELECT]](s32)
+    ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32)
+    ; CHECK: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[UV]](s32)
+    ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32
+    ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[AMDGPU_FFBH_U32_1]], [[C]]
+    ; CHECK: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[ADD]]
+    ; CHECK: S_ENDPGM 0, implicit [[UMIN]](s32)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_CTLZ_ZERO_UNDEF %0
     S_ENDPGM 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir
@@ -60,14 +60,12 @@
     ; CHECK-LABEL: name: cttz_zero_undef_s64_v
     ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
     ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
-    ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
-    ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV]](s32), [[C]]
-    ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTTZ_ZERO_UNDEF [[UV1]](s32)
-    ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32
-    ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[CTTZ_ZERO_UNDEF]], [[C1]]
-    ; CHECK: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:vgpr(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s32)
-    ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[CTTZ_ZERO_UNDEF1]]
-    ; CHECK: S_ENDPGM 0, implicit [[SELECT]](s32)
+    ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBL_B32 [[UV]](s32)
+    ; CHECK: [[AMDGPU_FFBL_B32_1:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBL_B32 [[UV1]](s32)
+    ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32
+    ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[AMDGPU_FFBL_B32_1]], [[C]]
+    ; CHECK: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[ADD]]
+    ; CHECK: S_ENDPGM 0, implicit [[UMIN]](s32)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_CTTZ_ZERO_UNDEF %0
     S_ENDPGM 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -778,20 +778,19 @@
 ; GFX10-GISEL-LABEL: v_ctlz_i64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v3, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v2, 32, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 64, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v3, v[1:2], s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, v4, v3
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 64, vcc_lo
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -900,10 +899,9 @@
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v4, v2
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[1:2]
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, v4, v3
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 64, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -599,17 +599,16 @@
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v3, s[4:5]
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 32, v0
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
-; GFX9-GISEL-NEXT:    global_store_dwordx2 v3, v[1:2], s[2:3]
+; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -698,10 +697,9 @@
 ; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v2
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
 ; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -770,20 +770,19 @@
 ; GFX10-GISEL-LABEL: v_cttz_i64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
+; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v3, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v4, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v2, 32, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 64, vcc_lo
-; GFX10-GISEL-NEXT:    global_store_dwordx2 v3, v[1:2], s[0:1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, v4, v3
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 64, vcc_lo
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -892,10 +891,9 @@
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v3, v2
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v4, v1
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[1:2]
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, v4, v3
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 64, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1065,16 +1065,15 @@
 ; GFX9-GISEL-NEXT:    v_or_b32_sdwa v4, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v3, v3, 0, 16
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v4, v4, 0, 16
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 16
 ; GFX9-GISEL-NEXT:    v_bfe_u32 v2, v2, 0, 16
-; GFX9-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v2, v2, 16, v0
-; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v3
-; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v4, v2
-; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 32, v0
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v4, v3
+; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v2
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v4, 32, v4
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
 ; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[2:3]
 ; GFX9-GISEL-NEXT:    s_endpgm
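As a standalone illustration of the identity the new RegBankSelect case relies on (it is not part of the patch itself), the C++ sketch below models the -1-on-zero behavior of ffbh/ffbl and checks both umin forms against __builtin_clzll/__builtin_ctzll. The helper names ffbh32, ffbl32, ctlz64 and cttz64 are invented for the example; the only behavior assumed is what the patch comment states, namely that the 32-bit instructions return -1 (all ones) for a zero input.

// Reference model: a 64-bit count can be narrowed to a umin of two 32-bit
// counts, with no compare/select, because ffbh/ffbl return ~0u on zero input.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint32_t ffbh32(uint32_t X) { return X ? __builtin_clz(X) : ~0u; }
static uint32_t ffbl32(uint32_t X) { return X ? __builtin_ctz(X) : ~0u; }

// (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), 32 + (ffbh lo))
static uint32_t ctlz64(uint64_t V) {
  uint32_t Hi = V >> 32, Lo = (uint32_t)V;
  // 32 + ffbh32(Lo) wraps to 31 when Lo == 0, which is still >= ffbh32(Hi)
  // for any non-zero Hi, so the unsigned min remains correct.
  return std::min(ffbh32(Hi), 32u + ffbh32(Lo));
}

// (cttz_zero_undef hi:lo) -> (umin 32 + (ffbl hi), (ffbl lo))
static uint32_t cttz64(uint64_t V) {
  uint32_t Hi = V >> 32, Lo = (uint32_t)V;
  return std::min(32u + ffbl32(Hi), ffbl32(Lo));
}

int main() {
  // The *_zero_undef forms are only defined for non-zero inputs, so zero is
  // deliberately not tested.
  for (uint64_t V : {1ull, 0x80ull, 1ull << 31, 1ull << 32, 0xff00000000ull,
                     0x8000000080000000ull, 1ull << 63}) {
    assert(ctlz64(V) == (uint32_t)__builtin_clzll(V));
    assert(cttz64(V) == (uint32_t)__builtin_ctzll(V));
  }
  return 0;
}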