diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -89,6 +89,8 @@ bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -941,7 +941,7 @@ .clampScalar(1, S32, S64) .widenScalarToNextPow2(0, 32) .widenScalarToNextPow2(1, 32) - .lower(); + .custom(); // The 64-bit versions produce 32-bit results, but only on the SALU. getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) @@ -1759,6 +1759,9 @@ return legalizeFFloor(MI, MRI, B); case TargetOpcode::G_BUILD_VECTOR: return legalizeBuildVector(MI, MRI, B); + case TargetOpcode::G_CTLZ: + case TargetOpcode::G_CTTZ: + return legalizeCTLZ_CTTZ(MI, MRI, B); default: return false; } @@ -2744,6 +2747,27 @@ return true; } +// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to +// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input +// case with a single min instruction instead of a compare+select. +bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + + unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ + ? 
AMDGPU::G_AMDGPU_FFBH_U32 + : AMDGPU::G_AMDGPU_FFBL_B32; + auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); + B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); + + MI.eraseFromParent(); + return true; +} + // Check that this is a G_XOR x, -1 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { if (MI.getOpcode() != TargetOpcode::G_XOR) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2513,6 +2513,8 @@ llvm_unreachable("narrowScalar should have succeeded"); return; } + case AMDGPU::G_AMDGPU_FFBH_U32: + case AMDGPU::G_AMDGPU_FFBL_B32: case AMDGPU::G_CTLZ_ZERO_UNDEF: case AMDGPU::G_CTTZ_ZERO_UNDEF: { const RegisterBank *DstBank = @@ -2528,18 +2530,26 @@ // We can narrow this more efficiently than Helper can by using ffbh/ffbl // which return -1 when the input is zero: - // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), 32 + (ffbh lo)) - // (cttz_zero_undef hi:lo) -> (umin 32 + (ffbl hi), (ffbl lo)) + // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) + // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) + // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) + // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo)) ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); MachineIRBuilder B(MI, ApplyVALU); SmallVector SrcRegs(OpdMapper.getVRegs(1)); unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF ? AMDGPU::G_AMDGPU_FFBH_U32 - : AMDGPU::G_AMDGPU_FFBL_B32; - unsigned Idx = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF; + : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF + ? 
AMDGPU::G_AMDGPU_FFBL_B32 + : Opc; + unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32; auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]}); auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]}); - Y = B.buildAdd(S32, Y, B.buildConstant(S32, 32)); + unsigned AddOpc = + Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF + ? AMDGPU::G_ADD + : AMDGPU::G_UADDSAT; + Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)}); Register DstReg = MI.getOperand(0).getReg(); B.buildUMin(DstReg, X, Y); MI.eraseFromParent(); @@ -3651,8 +3661,6 @@ case AMDGPU::G_INTRINSIC_TRUNC: case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? case AMDGPU::G_FSHR: // TODO: Expand for scalar - case AMDGPU::G_AMDGPU_FFBH_U32: - case AMDGPU::G_AMDGPU_FFBL_B32: case AMDGPU::G_AMDGPU_FMIN_LEGACY: case AMDGPU::G_AMDGPU_FMAX_LEGACY: case AMDGPU::G_AMDGPU_RCP_IFLAG: @@ -3758,6 +3766,8 @@ OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); break; } + case AMDGPU::G_AMDGPU_FFBH_U32: + case AMDGPU::G_AMDGPU_FFBL_B32: case AMDGPU::G_CTLZ_ZERO_UNDEF: case AMDGPU::G_CTTZ_ZERO_UNDEF: { unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz.mir @@ -9,12 +9,10 @@ liveins: $vgpr0 ; CHECK-LABEL: name: ctlz_s32_s32 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[CTLZ_ZERO_UNDEF]] - ; CHECK: $vgpr0 = COPY [[SELECT]](s32) + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = 
G_AMDGPU_FFBH_U32 [[COPY]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] + ; CHECK: $vgpr0 = COPY [[UMIN]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CTLZ %0 $vgpr0 = COPY %1 @@ -28,12 +26,10 @@ liveins: $vgpr0_vgpr1 ; CHECK-LABEL: name: ctlz_s32_s64 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s64) - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s64), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[CTLZ_ZERO_UNDEF]] - ; CHECK: $vgpr0 = COPY [[SELECT]](s32) + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[COPY]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] + ; CHECK: $vgpr0 = COPY [[UMIN]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CTLZ %0 $vgpr0 = COPY %1 @@ -47,12 +43,10 @@ liveins: $vgpr0_vgpr1 ; CHECK-LABEL: name: ctlz_s64_s64 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s64) - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s64), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[CTLZ_ZERO_UNDEF]] - ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[SELECT]](s32) + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[COPY]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UMIN]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CTLZ %0 @@ -67,14 
+61,12 @@ liveins: $vgpr0 ; CHECK-LABEL: name: ctlz_s16_s32 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[CTLZ_ZERO_UNDEF]] - ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32) - ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[COPY]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UMIN]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] ; CHECK: $vgpr0 = COPY [[AND]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s16) = G_CTLZ %0 @@ -93,13 +85,11 @@ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] - ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32) - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[C1]] - ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[CTLZ_ZERO_UNDEF]] - ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[C3]] + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[AND]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 
16 + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C2]] ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] @@ -120,15 +110,12 @@ ; CHECK-LABEL: name: ctlz_v2s32_v2s32 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[UV]](s32) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UV]](s32), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[CTLZ_ZERO_UNDEF]] - ; CHECK: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[UV1]](s32) - ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UV1]](s32), [[C]] - ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C1]], [[CTLZ_ZERO_UNDEF1]] - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32) + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] + ; CHECK: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32) + ; CHECK: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_CTLZ %0 @@ -144,15 +131,12 @@ ; CHECK-LABEL: name: ctlz_v2s32_v2s64 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = 
G_CTLZ_ZERO_UNDEF [[UV]](s64) - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UV]](s64), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[CTLZ_ZERO_UNDEF]] - ; CHECK: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[UV1]](s64) - ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UV1]](s64), [[C]] - ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C1]], [[CTLZ_ZERO_UNDEF1]] - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32) + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] + ; CHECK: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s64) + ; CHECK: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<2 x s32>) = G_CTLZ %0 @@ -173,19 +157,16 @@ ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] - ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32) - ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[C2]] - ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C3]], [[CTLZ_ZERO_UNDEF]] - ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[C]] + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[AND]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK: 
[[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C2]] + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C]] ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; CHECK: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND1]](s32) - ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND1]](s32), [[C2]] - ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C3]], [[CTLZ_ZERO_UNDEF1]] - ; CHECK: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SELECT1]], [[C]] + ; CHECK: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[AND1]](s32) + ; CHECK: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C2]] + ; CHECK: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UMIN1]], [[C]] ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32) ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] @@ -212,13 +193,11 @@ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] - ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32) - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[C1]] - ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[CTLZ_ZERO_UNDEF]] - ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 - ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[C3]] + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[AND]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMIN]], [[C2]] ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY 
[[SUB]](s32) ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] @@ -242,15 +221,13 @@ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934591 ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY [[COPY]](s64) ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] - ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s64) - ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s64), [[C1]] - ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[CTLZ_ZERO_UNDEF]] - ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[SELECT]](s32) - ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 31 + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[AND]](s64) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C1]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UMIN]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 31 ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s64) - ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C3]](s64) + ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; CHECK: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] ; CHECK: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] ; CHECK: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[USUBO]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-cttz.mir @@ -9,12 +9,10 @@ liveins: $vgpr0 ; CHECK-LABEL: name: cttz_s32_s32 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: 
[[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[CTTZ_ZERO_UNDEF]] - ; CHECK: $vgpr0 = COPY [[SELECT]](s32) + ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBL_B32 [[COPY]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[C]] + ; CHECK: $vgpr0 = COPY [[UMIN]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CTTZ %0 $vgpr0 = COPY %1 @@ -28,12 +26,10 @@ liveins: $vgpr0_vgpr1 ; CHECK-LABEL: name: cttz_s32_s64 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s64) - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s64), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[CTTZ_ZERO_UNDEF]] - ; CHECK: $vgpr0 = COPY [[SELECT]](s32) + ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBL_B32 [[COPY]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[C]] + ; CHECK: $vgpr0 = COPY [[UMIN]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CTTZ %0 $vgpr0 = COPY %1 @@ -47,12 +43,10 @@ liveins: $vgpr0_vgpr1 ; CHECK-LABEL: name: cttz_s64_s64 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s64) - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s64), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], 
[[CTTZ_ZERO_UNDEF]] - ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[SELECT]](s32) + ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBL_B32 [[COPY]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[C]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UMIN]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CTTZ %0 @@ -67,14 +61,12 @@ liveins: $vgpr0 ; CHECK-LABEL: name: cttz_s16_s32 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[CTTZ_ZERO_UNDEF]] - ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32) - ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBL_B32 [[COPY]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UMIN]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] ; CHECK: $vgpr0 = COPY [[AND]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s16) = G_CTTZ %0 @@ -116,15 +108,12 @@ ; CHECK-LABEL: name: cttz_v2s32_v2s32 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s32) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UV]](s32), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 32 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[CTTZ_ZERO_UNDEF]] - ; CHECK: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV1]](s32) - ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UV1]](s32), [[C]] - ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C1]], [[CTTZ_ZERO_UNDEF1]] - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32) + ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBL_B32 [[UV]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[C]] + ; CHECK: [[AMDGPU_FFBL_B32_1:%[0-9]+]]:_(s32) = G_AMDGPU_FFBL_B32 [[UV1]](s32) + ; CHECK: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBL_B32_1]], [[C]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_CTTZ %0 @@ -140,15 +129,12 @@ ; CHECK-LABEL: name: cttz_v2s32_v2s64 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s64) - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UV]](s64), [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 - ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C1]], [[CTTZ_ZERO_UNDEF]] - ; CHECK: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV1]](s64) - ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[UV1]](s64), [[C]] - ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C1]], [[CTTZ_ZERO_UNDEF1]] - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32) + ; CHECK: 
[[AMDGPU_FFBL_B32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBL_B32 [[UV]](s64) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 + ; CHECK: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[C]] + ; CHECK: [[AMDGPU_FFBL_B32_1:%[0-9]+]]:_(s32) = G_AMDGPU_FFBL_B32 [[UV1]](s64) + ; CHECK: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBL_B32_1]], [[C]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<2 x s32>) = G_CTTZ %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbh-u32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbh-u32.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbh-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbh-u32.mir @@ -12,8 +12,7 @@ ; CHECK-LABEL: name: ffbh_u32_s ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[COPY1]](s32) + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:sgpr(s32) = G_AMDGPU_FFBH_U32 [[COPY]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = G_AMDGPU_FFBH_U32 %0 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbl-b32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbl-b32.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbl-b32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbl-b32.mir @@ -12,8 +12,7 @@ ; CHECK-LABEL: name: ffbl_b32_s ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBL_B32 [[COPY1]](s32) + ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:sgpr(s32) = G_AMDGPU_FFBL_B32 [[COPY]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = G_AMDGPU_FFBL_B32 %0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir @@ -60,11 +60,11 @@ ; CHECK-LABEL: name: cttz_zero_undef_s64_v ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBL_B32 [[UV]](s32) - ; CHECK: [[AMDGPU_FFBL_B32_1:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBL_B32 [[UV1]](s32) + ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBL_B32 [[UV]](s32) + ; CHECK: [[AMDGPU_FFBL_B32_1:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBL_B32 [[UV1]](s32) ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32 - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[AMDGPU_FFBL_B32_1]], [[C]] - ; CHECK: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[ADD]] + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[AMDGPU_FFBL_B32_1]], [[C]] + ; CHECK: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[ADD]] ; CHECK: S_ENDPGM 0, implicit [[UMIN]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CTTZ_ZERO_UNDEF %0 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -82,8 +82,7 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_flbit_i32_b32 s0, s4 -; GFX10-GISEL-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-GISEL-NEXT: s_cselect_b32 s0, 32, s0 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm @@ -172,13 +171,12 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 32, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -281,16 +279,14 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v3, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 32, vcc_lo -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 32, vcc_lo +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -417,22 +413,18 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; 
GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v4, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v5, v1 -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v6, v2 -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v7, v3 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 32, vcc_lo -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, 32, vcc_lo -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 32, vcc_lo -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 32, vcc_lo +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 ; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -536,10 +528,8 @@ ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 32, vcc_lo +; GFX10-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 ; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm @@ -636,8 +626,7 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] -; GFX10-GISEL-NEXT: s_cmp_eq_u64 s[2:3], 
0 -; GFX10-GISEL-NEXT: s_cselect_b32 s0, 64, s0 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -732,8 +721,7 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] -; GFX10-GISEL-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10-GISEL-NEXT: s_cselect_b32 s0, 64, s0 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -850,13 +838,12 @@ ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v3, v0 -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v4, v1 -; GFX10-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: v_min_u32_e32 v3, v4, v3 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 64, vcc_lo +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -976,12 +963,11 @@ ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v3, v1 -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v4, v2 -; GFX10-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[1:2] -; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: v_min_u32_e32 v3, v4, v3 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 64, vcc_lo +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 +; 
GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 +; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1075,9 +1061,9 @@ ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 32, vcc_lo +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1170,9 +1156,8 @@ ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 32, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -1278,13 +1263,12 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 32, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-GISEL-NEXT: 
v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -1389,13 +1373,12 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 32, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -1498,10 +1481,10 @@ ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 32, vcc_lo +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v0, 24, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1606,10 +1589,10 @@ ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 32, vcc_lo -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 16, v1 -; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 
v1, v1, s2, vcc_lo +; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr @@ -1713,10 +1696,10 @@ ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 32, vcc_lo +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v0, 25, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -82,8 +82,7 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_ff1_i32_b32 s0, s4 -; GFX10-GISEL-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-GISEL-NEXT: s_cselect_b32 s0, 32, s0 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm @@ -172,13 +171,12 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 32, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -281,16 +279,14 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v3, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 32, vcc_lo -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 32, vcc_lo +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -417,22 +413,18 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v4, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v5, v1 -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v6, v2 -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v7, v3 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 32, vcc_lo -; 
GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, 32, vcc_lo -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 32, vcc_lo -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 32, vcc_lo +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 ; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -625,8 +617,7 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] -; GFX10-GISEL-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10-GISEL-NEXT: s_cselect_b32 s0, 64, s0 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -721,8 +712,7 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] -; GFX10-GISEL-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10-GISEL-NEXT: s_cselect_b32 s0, 64, s0 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -839,13 +829,12 @@ ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v3, v1 -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v4, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-GISEL-NEXT: 
v_ffbl_b32_e32 v1, v1 +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: v_min_u32_e32 v3, v4, v3 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 64, vcc_lo +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -965,12 +954,11 @@ ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v3, v2 -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v4, v1 -; GFX10-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[1:2] -; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: v_min_u32_e32 v3, v4, v3 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 64, vcc_lo +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 +; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1064,9 +1052,9 @@ ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 32, vcc_lo +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1159,9 +1147,8 @@ ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 32, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -1267,13 +1254,12 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 32, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -1378,13 +1364,12 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 32, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX10-GISEL-NEXT: 
global_store_dword v1, v0, s[0:1]