diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -243,6 +243,7 @@ LegalizeResult narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + LegalizeResult narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerBitcast(MachineInstr &MI); LegalizeResult lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -983,6 +983,8 @@ return narrowScalarCTLZ(MI, TypeIdx, NarrowTy); case TargetOpcode::G_CTTZ: return narrowScalarCTTZ(MI, TypeIdx, NarrowTy); + case TargetOpcode::G_CTPOP: + return narrowScalarCTPOP(MI, TypeIdx, NarrowTy); default: return UnableToLegalize; } @@ -3919,6 +3921,30 @@ } LegalizerHelper::LegalizeResult +LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, + LLT NarrowTy) { + if (TypeIdx != 1) + return UnableToLegalize; + + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + unsigned NarrowSize = NarrowTy.getSizeInBits(); + + if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) { + auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1)); + + auto LoCTPOP = MIRBuilder.buildCTPOP(NarrowTy, UnmergeSrc.getReg(0)); + auto HiCTPOP = MIRBuilder.buildCTPOP(NarrowTy, UnmergeSrc.getReg(1)); + auto Out = MIRBuilder.buildAdd(NarrowTy, HiCTPOP, LoCTPOP); + MIRBuilder.buildZExt(MI.getOperand(0), Out); + + MI.eraseFromParent(); + return Legalized; + } + + return UnableToLegalize; +} + +LegalizerHelper::LegalizeResult LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { unsigned Opc = MI.getOpcode(); auto &TII = *MI.getMF()->getSubtarget().getInstrInfo(); @@ -4017,6 +4043,57 @@ MI.getOperand(1).setReg(MIBTmp.getReg(0)); return Legalized; } + case TargetOpcode::G_CTPOP: { + unsigned Size = Ty.getSizeInBits(); + MachineIRBuilder &B = MIRBuilder; + + // Count set bits in blocks of 2 bits. Default approach would be + // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 } + // We use following formula instead: + // B2Count = val - { (val >> 1) & 0x55555555 } + // since it gives same result in blocks of 2 with one instruction less. + auto C_1 = B.buildConstant(Ty, 1); + auto B2Set1LoTo1Hi = B.buildLShr(Ty, MI.getOperand(1).getReg(), C_1); + APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55)); + auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0); + auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0); + auto B2Count = B.buildSub(Ty, MI.getOperand(1).getReg(), B2Count1Hi); + + // In order to get count in blocks of 4 add values from adjacent block of 2. + // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 } + auto C_2 = B.buildConstant(Ty, 2); + auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2); + APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33)); + auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0); + auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0); + auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0); + auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count); + + // For count in blocks of 8 bits we don't have to mask high 4 bits before + // addition since count value sits in range {0,...,8} and 4 bits are enough + // to hold such binary values. After addition high 4 bits still hold count + // of set bits in high 4 bit block, set them to zero and get 8 bit result. + // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F + auto C_4 = B.buildConstant(Ty, 4); + auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4); + auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count); + APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F)); + auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0); + auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0); + + assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm"); + // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this + // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks. + auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01))); + auto ResTmp = B.buildMul(Ty, B8Count, MulMask); + + // Shift count result from 8 high bits to low bits. + auto C_SizeM8 = B.buildConstant(Ty, Size - 8); + B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8); + + MI.eraseFromParent(); + return Legalized; + } } } diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp --- a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -214,6 +214,10 @@ getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF) .lowerFor({{s32, s32}, {s64, s64}}); + getActionDefinitionsBuilder(G_CTPOP) + .lowerFor({{s32, s32}}) + .clampScalar(1, s32, s32); + // FP instructions getActionDefinitionsBuilder(G_FCONSTANT) .legalFor({s32, s64}); diff --git a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/ctpop.mir b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/ctpop.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/ctpop.mir @@ -0,0 +1,102 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=mipsel-linux-gnu -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=MIPS32 +--- +name: ctpop_i32 +alignment: 4 +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $a0 + + ; MIPS32-LABEL: name: ctpop_i32 + ; MIPS32: liveins: $a0 + ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0 + ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; MIPS32: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1431655765 + ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] + ; MIPS32: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[AND]] + ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; MIPS32: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s32) + ; MIPS32: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 858993459 + ; MIPS32: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C3]] + ; MIPS32: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C3]] + ; MIPS32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND1]], [[AND2]] + ; MIPS32: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; MIPS32: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C4]](s32) + ; MIPS32: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]] + ; MIPS32: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 + ; MIPS32: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]] + ; MIPS32: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 + ; MIPS32: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]] + ; MIPS32: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; MIPS32: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32) + ; MIPS32: $v0 = COPY [[LSHR3]](s32) + ; MIPS32: RetRA implicit $v0 + %0:_(s32) = COPY $a0 + %1:_(s32) = G_CTPOP %0(s32) + $v0 = COPY %1(s32) + RetRA implicit $v0 + +... +--- +name: ctpop_i64 +alignment: 4 +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $a0, $a1 + + ; MIPS32-LABEL: name: ctpop_i64 + ; MIPS32: liveins: $a0, $a1 + ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY $a0 + ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1 + ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; MIPS32: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1431655765 + ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] + ; MIPS32: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[AND]] + ; MIPS32: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; MIPS32: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s32) + ; MIPS32: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 858993459 + ; MIPS32: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C3]] + ; MIPS32: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C3]] + ; MIPS32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND1]], [[AND2]] + ; MIPS32: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; MIPS32: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C4]](s32) + ; MIPS32: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR2]], [[ADD]] + ; MIPS32: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 + ; MIPS32: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]] + ; MIPS32: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 + ; MIPS32: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]] + ; MIPS32: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; MIPS32: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32) + ; MIPS32: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32) + ; MIPS32: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C1]] + ; MIPS32: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[AND4]] + ; MIPS32: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[SUB1]], [[C2]](s32) + ; MIPS32: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR5]], [[C3]] + ; MIPS32: [[AND6:%[0-9]+]]:_(s32) = G_AND [[SUB1]], [[C3]] + ; MIPS32: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[AND5]], [[AND6]] + ; MIPS32: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[ADD2]], [[C4]](s32) + ; MIPS32: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR6]], [[ADD2]] + ; MIPS32: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ADD3]], [[C5]] + ; MIPS32: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND7]], [[C6]] + ; MIPS32: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[MUL1]], [[C7]](s32) + ; MIPS32: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[LSHR7]], [[LSHR3]] + ; MIPS32: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; MIPS32: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[ADD4]](s32), [[C8]](s32) + ; MIPS32: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64) + ; MIPS32: $v0 = COPY [[UV]](s32) + ; MIPS32: $v1 = COPY [[UV1]](s32) + ; MIPS32: RetRA implicit $v0, implicit $v1 + %1:_(s32) = COPY $a0 + %2:_(s32) = COPY $a1 + %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32) + %3:_(s64) = G_CTPOP %0(s64) + %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3(s64) + $v0 = COPY %4(s32) + $v1 = COPY %5(s32) + RetRA implicit $v0, implicit $v1 + +... diff --git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/ctpop.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/ctpop.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/ctpop.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O0 -mtriple=mipsel-linux-gnu -global-isel -verify-machineinstrs %s -o -| FileCheck %s -check-prefixes=MIPS32 + +define i32 @ctpop_i32(i32 %a) { +; MIPS32-LABEL: ctpop_i32: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: srl $1, $4, 1 +; MIPS32-NEXT: lui $2, 21845 +; MIPS32-NEXT: ori $2, $2, 21845 +; MIPS32-NEXT: and $1, $1, $2 +; MIPS32-NEXT: subu $1, $4, $1 +; MIPS32-NEXT: srl $2, $1, 2 +; MIPS32-NEXT: lui $3, 13107 +; MIPS32-NEXT: ori $3, $3, 13107 +; MIPS32-NEXT: and $2, $2, $3 +; MIPS32-NEXT: and $1, $1, $3 +; MIPS32-NEXT: addu $1, $2, $1 +; MIPS32-NEXT: srl $2, $1, 4 +; MIPS32-NEXT: addu $1, $2, $1 +; MIPS32-NEXT: lui $2, 3855 +; MIPS32-NEXT: ori $2, $2, 3855 +; MIPS32-NEXT: and $1, $1, $2 +; MIPS32-NEXT: lui $2, 257 +; MIPS32-NEXT: ori $2, $2, 257 +; MIPS32-NEXT: mul $1, $1, $2 +; MIPS32-NEXT: srl $2, $1, 24 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +entry: + %0 = call i32 @llvm.ctpop.i32(i32 %a) + ret i32 %0 +} +declare i32 @llvm.ctpop.i32(i32) + + +define i64 @ctpop_i64(i64 %a) { +; MIPS32-LABEL: ctpop_i64: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: srl $1, $4, 1 +; MIPS32-NEXT: lui $2, 21845 +; MIPS32-NEXT: ori $2, $2, 21845 +; MIPS32-NEXT: and $1, $1, $2 +; MIPS32-NEXT: subu $1, $4, $1 +; MIPS32-NEXT: srl $3, $1, 2 +; MIPS32-NEXT: lui $4, 13107 +; MIPS32-NEXT: ori $4, $4, 13107 +; MIPS32-NEXT: and $3, $3, $4 +; MIPS32-NEXT: and $1, $1, $4 +; MIPS32-NEXT: addu $1, $3, $1 +; MIPS32-NEXT: srl $3, $1, 4 +; MIPS32-NEXT: addu $1, $3, $1 +; MIPS32-NEXT: lui $3, 3855 +; MIPS32-NEXT: ori $3, $3, 3855 +; MIPS32-NEXT: and $1, $1, $3 +; MIPS32-NEXT: lui $6, 257 +; MIPS32-NEXT: ori $6, $6, 257 +; MIPS32-NEXT: mul $1, $1, $6 +; MIPS32-NEXT: srl $1, $1, 24 +; MIPS32-NEXT: srl $7, $5, 1 +; MIPS32-NEXT: and $2, $7, $2 +; MIPS32-NEXT: subu $2, $5, $2 +; MIPS32-NEXT: srl $5, $2, 2 +; MIPS32-NEXT: and $5, $5, $4 +; MIPS32-NEXT: and $2, $2, $4 +; MIPS32-NEXT: addu $2, $5, $2 +; MIPS32-NEXT: srl $4, $2, 4 +; MIPS32-NEXT: addu $2, $4, $2 +; MIPS32-NEXT: and $2, $2, $3 +; MIPS32-NEXT: mul $2, $2, $6 +; MIPS32-NEXT: srl $2, $2, 24 +; MIPS32-NEXT: addu $2, $2, $1 +; MIPS32-NEXT: ori $3, $zero, 0 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +entry: + %0 = call i64 @llvm.ctpop.i64(i64 %a) + ret i64 %0 +} +declare i64 @llvm.ctpop.i64(i64)