diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2491,9 +2491,7 @@ return; } case AMDGPU::G_CTPOP: - case AMDGPU::G_BITREVERSE: - case AMDGPU::G_CTLZ_ZERO_UNDEF: - case AMDGPU::G_CTTZ_ZERO_UNDEF: { + case AMDGPU::G_BITREVERSE: { const RegisterBank *DstBank = OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; if (DstBank == &AMDGPU::SGPRRegBank) @@ -2515,6 +2513,38 @@ llvm_unreachable("narrowScalar should have succeeded"); return; } + case AMDGPU::G_CTLZ_ZERO_UNDEF: + case AMDGPU::G_CTTZ_ZERO_UNDEF: { + const RegisterBank *DstBank = + OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + if (DstBank == &AMDGPU::SGPRRegBank) + break; + + Register SrcReg = MI.getOperand(1).getReg(); + const LLT S32 = LLT::scalar(32); + LLT Ty = MRI.getType(SrcReg); + if (Ty == S32) + break; + + // We can narrow this more efficiently than Helper can by using ffbh/ffbl + // which return -1 when the input is zero: + // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), 32 + (ffbh lo)) + // (cttz_zero_undef hi:lo) -> (umin 32 + (ffbl hi), (ffbl lo)) + ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); + MachineIRBuilder B(MI, ApplyVALU); + SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); + unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF + ? 
AMDGPU::G_AMDGPU_FFBH_U32 + : AMDGPU::G_AMDGPU_FFBL_B32; + unsigned Idx = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF; + auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]}); + auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]}); + Y = B.buildAdd(S32, Y, B.buildConstant(S32, 32)); + Register DstReg = MI.getOperand(0).getReg(); + B.buildUMin(DstReg, X, Y); + MI.eraseFromParent(); + return; + } case AMDGPU::G_SEXT: case AMDGPU::G_ZEXT: case AMDGPU::G_ANYEXT: { @@ -3729,7 +3759,13 @@ break; } case AMDGPU::G_CTLZ_ZERO_UNDEF: - case AMDGPU::G_CTTZ_ZERO_UNDEF: + case AMDGPU::G_CTTZ_ZERO_UNDEF: { + unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); + OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); + OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size); + break; + } case AMDGPU::G_CTPOP: { unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -1085,14 +1085,12 @@ ; SI-NEXT: s_movk_i32 s6, 0xff ; SI-NEXT: v_and_b32_e32 v0, s6, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0, v0 -; SI-NEXT: v_ffbh_u32_e32 v2, v0 ; SI-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2 -; SI-NEXT: v_ffbh_u32_e32 v3, v1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; SI-NEXT: v_mov_b32_e32 v3, 0xbe -; SI-NEXT: v_sub_i32_e32 v4, vcc, v3, v2 +; SI-NEXT: v_ffbh_u32_e32 v3, v0 +; SI-NEXT: v_ffbh_u32_e32 v2, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v3 +; SI-NEXT: v_min_u32_e32 v2, v2, v3 +; SI-NEXT: v_sub_i32_e32 v4, vcc, 0xbe, v2 ; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 ; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; 
SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3 @@ -1117,14 +1115,12 @@ ; VI-NEXT: s_movk_i32 s6, 0xff ; VI-NEXT: v_and_b32_e32 v0, s6, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 -; VI-NEXT: v_ffbh_u32_e32 v3, v1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; VI-NEXT: v_mov_b32_e32 v3, 0xbe -; VI-NEXT: v_sub_u32_e32 v4, vcc, v3, v2 +; VI-NEXT: v_ffbh_u32_e32 v3, v0 +; VI-NEXT: v_ffbh_u32_e32 v2, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v3 +; VI-NEXT: v_min_u32_e32 v2, v2, v3 +; VI-NEXT: v_sub_u32_e32 v4, vcc, 0xbe, v2 ; VI-NEXT: v_lshlrev_b64 v[2:3], v2, v[0:1] ; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3 @@ -1153,11 +1149,10 @@ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_movk_i32 s4, 0xff ; SI-NEXT: v_and_b32_e32 v0, s4, v0 -; SI-NEXT: v_ffbh_u32_e32 v2, v0 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2 -; SI-NEXT: v_ffbh_u32_e32 v3, 0 -; SI-NEXT: v_cmp_eq_u32_e64 vcc, 0, 0 -; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: v_ffbh_u32_e32 v3, v0 +; SI-NEXT: v_ffbh_u32_e32 v2, 0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v3 +; SI-NEXT: v_min_u32_e32 v2, v2, v3 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_sub_i32_e32 v4, vcc, 0xbe, v2 ; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 @@ -1183,11 +1178,10 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_movk_i32 s4, 0xff ; VI-NEXT: v_and_b32_e32 v0, s4, v0 -; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 -; VI-NEXT: v_ffbh_u32_e32 v3, 0 -; VI-NEXT: v_cmp_eq_u32_e64 vcc, 0, 0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_ffbh_u32_e32 v3, v0 +; VI-NEXT: v_ffbh_u32_e32 v2, 0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v3 +; VI-NEXT: v_min_u32_e32 v2, v2, v3 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_sub_u32_e32 v4, vcc, 0xbe, v2 ; VI-NEXT: v_lshlrev_b64 
v[2:3], v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir @@ -60,14 +60,12 @@ ; CHECK-LABEL: name: ctlz_zero_undef_s64_v ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV1]](s32), [[C]] - ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTLZ_ZERO_UNDEF [[UV]](s32) - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32 - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[CTLZ_ZERO_UNDEF]], [[C1]] - ; CHECK: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:vgpr(s32) = G_CTLZ_ZERO_UNDEF [[UV1]](s32) - ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[CTLZ_ZERO_UNDEF1]] - ; CHECK: S_ENDPGM 0, implicit [[SELECT]](s32) + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32) + ; CHECK: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[UV]](s32) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[AMDGPU_FFBH_U32_1]], [[C]] + ; CHECK: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[ADD]] + ; CHECK: S_ENDPGM 0, implicit [[UMIN]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CTLZ_ZERO_UNDEF %0 S_ENDPGM 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir @@ -60,14 +60,12 @@ ; CHECK-LABEL: name: cttz_zero_undef_s64_v ; CHECK: 
[[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV]](s32), [[C]] - ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTTZ_ZERO_UNDEF [[UV1]](s32) - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32 - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[CTTZ_ZERO_UNDEF]], [[C1]] - ; CHECK: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:vgpr(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s32) - ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[CTTZ_ZERO_UNDEF1]] - ; CHECK: S_ENDPGM 0, implicit [[SELECT]](s32) + ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBL_B32 [[UV]](s32) + ; CHECK: [[AMDGPU_FFBL_B32_1:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBL_B32 [[UV1]](s32) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[AMDGPU_FFBL_B32_1]], [[C]] + ; CHECK: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[ADD]] + ; CHECK: S_ENDPGM 0, implicit [[UMIN]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CTTZ_ZERO_UNDEF %0 S_ENDPGM 0, implicit %1