Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -71,6 +71,10 @@
   bool MadeIRChange = false;
+  // Handle bit counting patterns.
+  BitCountCombine BCC(TLI, DL);
+  MadeIRChange |= BCC.run(F);
+
   // Handle TruncInst patterns
   TruncInstCombine TIC(TLI, DL, DT);
   MadeIRChange |= TIC.run(F);
@@ -87,6 +91,10 @@
   auto &DL = F.getParent()->getDataLayout();
   bool MadeIRChange = false;
+  // Handle bit counting patterns.
+  BitCountCombine BCC(TLI, DL);
+  MadeIRChange |= BCC.run(F);
+
   // Handle TruncInst patterns
   TruncInstCombine TIC(TLI, DL, DT);
   MadeIRChange |= TIC.run(F);
Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -118,4 +118,28 @@
   /// \param SclTy scalar version of new type to reduce expression dag into.
   void ReduceExpressionDag(Type *SclTy);
 };
+
+//===----------------------------------------------------------------------===//
+// BitCountCombine - looks for code that computes the population count or the
+// count of leading zeros using the typical bit-twiddling algorithms from
+// Hacker's Delight.
+//===----------------------------------------------------------------------===//
+
+class BitCountCombine {
+public:
+  BitCountCombine(const TargetLibraryInfo &TLI, const DataLayout &DL)
+      : TLI(TLI), DL(DL) {}
+
+  bool run(Function &F);
+
+private:
+  Value *matchCtpopW(Instruction &In, unsigned BW);
+  Value *optimizeToCtpop(Instruction &In);
+  Value *optimizeToCtlz(Instruction &In);
+  bool runOnBlock(BasicBlock &B);
+
+  const TargetLibraryInfo &TLI;
+  const DataLayout &DL;
+};
+
 } // end namespace llvm.
Index: lib/Transforms/AggressiveInstCombine/BitCountCombine.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/AggressiveInstCombine/BitCountCombine.cpp
@@ -0,0 +1,264 @@
+//===- BitCountCombine.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file looks for code that calculates the population count or the count
+// of leading zeros using the bit-manipulation methods from Hacker's Delight.
+//
+// ctpop(n):
+//   bw = bitwidth(n)
+//   n = (n & 0x01010101..0101) + ((n & 0x10101010..1010) >> 1)
+//   n = (n & 0x00110011..0011) + ((n & 0x11001100..1100) >> 2)
+//   ...
+//   n = (n & 0x000000..111111) + ((n & 0x111111..000000) >> bw/2)
+//   return n
+//
+// ctlz(n):
+//   bw = bitwidth(n)
+//   n = n | (n >> 1)
+//   n = n | (n >> 2)
+//   n = n | (n >> 4)
+//   ...
+//   n = n | (n >> bw/2)
+//   return bw - ctpop(n)
+//
+//===----------------------------------------------------------------------===//
+
+#include "AggressiveInstCombineInternal.h"
+#include "llvm/Analysis/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+// Check if In matches a ctpop calculation pattern for a value of width BW
+// bits. If so, return the argument V such that ctpop(V) would be a candidate
+// for replacing In.
+// The matched pattern is:
+//   x0 := (V & 0x55..5)  + ((V>>1) & 0x55..5)
+//   x1 := (x0 & 0x33..3) + ((x0>>2) & 0x33..3)
+//   ...
+//   xn := (xn-1 & 0x00..0FF..F) + ((xn-1>>BW/2) & 0x00..0FF..F)
+// where xn is the candidate for ctpop(V).
+Value *BitCountCombine::matchCtpopW(Instruction &In, unsigned BW) {
+  auto matchStep = [] (Value *V, unsigned S, APInt M, bool ShiftAlone)
+        -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Add(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+
+    auto matchAndShift = [S,M,ShiftAlone] (Value *V0, Value *V1) -> Value* {
+      Value *V = nullptr;
+      const APInt *P = &M;
+      auto Mask = m_APInt(P);
+      auto Shift = m_SpecificInt(S);
+
+      if (!match(V0, m_And(m_Value(V), Mask)))
+        return nullptr;
+      if (ShiftAlone) {
+        if (!match(V1, m_LShr(m_Specific(V), Shift)))
+          return nullptr;
+      } else {
+        if (!match(V1, m_And(m_LShr(m_Specific(V), Shift), Mask)))
+          return nullptr;
+      }
+      return V;
+    };
+
+    if (Value *T = matchAndShift(Op0, Op1))
+      return T;
+    if (Value *T = matchAndShift(Op1, Op0))
+      return T;
+    return nullptr;
+  };
+
+  // Generate the bitmask for the & operation. BW is the bit-width of the
+  // entire mask. The masks are:
+  //   0b01010101..01010101   0x55..55   1 bit every 2 bits
+  //   0b00110011..00110011   0x33..33   2 bits every 4 bits
+  //   0b00000111..00000111   0x07..07   3 bits every 8 bits
+  //   ...                    ...        logS bits every S bits
+  // Normally the masks would be 01010101, 00110011, 00001111, i.e. the
+  // number of contiguous 1 bits in each group would be twice the number
+  // in the previous mask, but by the time this code runs, the "demanded"
+  // bits have been optimized to only require one more 1 bit in each
+  // subsequent mask. This function generates the post-optimized masks.
+  auto getMask = [] (unsigned S, unsigned BW) -> APInt {
+    assert(isPowerOf2_32(S));
+    APInt M(BW, S-1);
+    APInt T(BW, 0);
+    while (M != 0) {
+      T |= M;
+      M <<= S;
+    }
+    return T;
+  };
+
+  Value *V = &In;
+  bool SA = true;
+  unsigned N = BW;
+  while (N > 1) {
+    unsigned S = N/2;
+    V = matchStep(V, S, getMask(N, BW), SA);
+    if (!V)
+      return nullptr;
+    N = S;
+    SA = false;
+  }
+
+  return V;
+}
+
+// If In is an expression that evaluates popcnt via a shift/add pattern,
+// return the equivalent expression using the ctpop intrinsic. Otherwise
+// return nullptr.
+Value *BitCountCombine::optimizeToCtpop(Instruction &In) {
+  IntegerType *Ty = dyn_cast<IntegerType>(In.getType());
+  if (!Ty)
+    return nullptr;
+
+  // Take the first shift amount feeding the add, and assume this is the
+  // last shift in the popcnt computation.
+  Value *Op0 = nullptr, *Op1 = nullptr;
+  if (!match(&In, m_Add(m_Value(Op0), m_Value(Op1))))
+    return nullptr;
+
+  // Shift by half-width.
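+  // Illustrative sketch only: for a 32-bit popcount, the final add being
+  // looked for here is expected to have the shape
+  //   %res = add (and %t, C), (lshr %t, 16)
+  // (the mask constant C is not inspected at this point). The constant
+  // shift amount, 16 in this sketch, is then assumed to be half the bit
+  // width of the value whose bits are being counted.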
+  uint64_t SH = 0;
+  if (!match(Op0, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op1, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op0, m_LShr(m_Value(), m_ConstantInt(SH))) &&
+      !match(Op1, m_LShr(m_Value(), m_ConstantInt(SH))))
+    return nullptr;
+
+  if (SH < 4 || !isPowerOf2_64(SH))
+    return nullptr;
+
+  Value *V = matchCtpopW(In, 2*SH);
+  if (!V)
+    return nullptr;
+
+  unsigned TW = Ty->getBitWidth(), BW = 2*SH;
+  if (BW < TW) {
+    // BW is the bit width of the expression whose population count is
+    // being calculated. TW is the bit width of the type associated with
+    // that expression. Usually they are the same, but for ctpop8 the
+    // type may be "unsigned", i.e. 32-bit, while the ctpop8 would only
+    // consider the low 8 bits. In that case BW=8 and TW=32.
+    KnownBits K(TW);
+    computeKnownBits(V, K, DL);
+    APInt Need0 = APInt::getBitsSet(TW, BW, TW);
+    if ((K.Zero & Need0) != Need0)
+      return nullptr;
+  }
+
+  IRBuilder<> Builder(&In);
+  Module *M = In.getParent()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, {V->getType()});
+  CallInst *CI = Builder.CreateCall(Func, {V});
+  CI->setDebugLoc(In.getDebugLoc());
+  return CI;
+}
+
+Value *BitCountCombine::optimizeToCtlz(Instruction &In) {
+  // Let bw = bitwidth(n), and convert
+  //   n = n | (n>>1)
+  //   n = n | (n>>2)
+  //   n = n | (n>>4)
+  //   ...
+  //   n = n | (n>>bw/2)
+  //   bw - ctpop(n)
+  // to
+  //   ctlz(n).
+  // This code expects that the ctpop intrinsic has already been generated.
+
+  uint64_t BW = 0;
+  if (!match(&In, m_Sub(m_ConstantInt(BW), m_Intrinsic<Intrinsic::ctpop>())))
+    return nullptr;
+  // Get the argument of the ctpop.
+  Value *V = cast<CallInst>(In.getOperand(1))->getOperand(0);
+
+  // The argument to ctpop can be zero-extended in some cases. It is safe
+  // to ignore the zext.
+  if (auto *Z = dyn_cast<ZExtInst>(V))
+    V = Z->getOperand(0);
+
+  IntegerType *Ty = cast<IntegerType>(V->getType());
+  if (BW < Ty->getBitWidth())
+    return nullptr;
+
+  auto matchOrShift = [] (Value *V, unsigned S) -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Or(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+    if (match(Op0, m_LShr(m_Specific(Op1), m_SpecificInt(S))))
+      return Op1;
+    if (match(Op1, m_LShr(m_Specific(Op0), m_SpecificInt(S))))
+      return Op0;
+    return nullptr;
+  };
+
+  unsigned N = BW;
+  while (N > 1) {
+    N /= 2;
+    V = matchOrShift(V, N);
+    if (!V)
+      return nullptr;
+  }
+
+  // The value of BW is the one that determines the type of ctlz's argument.
+  IRBuilder<> Builder(&In);
+  if (BW > Ty->getBitWidth()) {
+    IntegerType *ATy = IntegerType::get(In.getContext(), BW);
+    V = Builder.CreateZExt(V, ATy);
+  }
+  Module *M = In.getParent()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, {V->getType()});
+  Value *False = ConstantInt::getFalse(In.getContext());
+  CallInst *CI = Builder.CreateCall(Func, {V, False});
+  CI->setDebugLoc(In.getDebugLoc());
+  if (In.getType() != CI->getType())
+    return Builder.CreateZExt(CI, In.getType());
+  return CI;
+}
+
+bool BitCountCombine::runOnBlock(BasicBlock &B) {
+  bool Changed = false;
+
+  // Iterate over the block for as long as new intrinsics are being generated.
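+  // Note: the block is scanned in reverse so that the final add of a
+  // bit-counting pattern is visited before the rest of the expression, and
+  // the scan restarts after every successful rewrite so that a newly
+  // created ctpop intrinsic can in turn be picked up by optimizeToCtlz.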
+ while (true) { + Value *Int = nullptr; + for (Instruction &In : reverse(B)) { + Int = optimizeToCtpop(In); + if (!Int) + Int = optimizeToCtlz(In); + if (Int) { + Changed = true; + In.replaceAllUsesWith(Int); + RecursivelyDeleteTriviallyDeadInstructions(&In, &TLI); + break; + } + } + if (!Int) + break; + } + + return Changed; +} + +bool BitCountCombine::run(Function &F) { + bool Changed = false; + for (BasicBlock &B : F) + Changed |= runOnBlock(B); + + return Changed; +} Index: lib/Transforms/AggressiveInstCombine/CMakeLists.txt =================================================================== --- lib/Transforms/AggressiveInstCombine/CMakeLists.txt +++ lib/Transforms/AggressiveInstCombine/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMAggressiveInstCombine AggressiveInstCombine.cpp + BitCountCombine.cpp TruncInstCombine.cpp ADDITIONAL_HEADER_DIRS Index: test/Transforms/AggressiveInstCombine/ctlz-combine.ll =================================================================== --- /dev/null +++ test/Transforms/AggressiveInstCombine/ctlz-combine.ll @@ -0,0 +1,160 @@ +; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s + +; unsigned ctlz16(unsigned short t0) { +; t0 = t0 | (t0>>1); +; t0 = t0 | (t0>>2); +; t0 = t0 | (t0>>4); +; t0 = t0 | (t0>>8); +; unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 0x5555); +; unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333); +; unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F); +; unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF); +; return 16-t4; +; } +; +; CHECK-LABEL: define i32 @ctlz16 +; CHECK: [[V0:%[a-zA-Z0-9_]+]] = call i16 @llvm.ctlz.i16(i16 %a0, i1 false) +; CHECK: zext i16 [[V0]] to i32 +define i32 @ctlz16(i16 zeroext %a0) local_unnamed_addr #0 { +b0: + %v0 = lshr i16 %a0, 1 + %v1 = or i16 %v0, %a0 + %v2 = lshr i16 %v1, 2 + %v3 = or i16 %v2, %v1 + %v4 = lshr i16 %v3, 4 + %v5 = or i16 %v4, %v3 + %v6 = lshr i16 %v5, 8 + %v7 = or i16 %v6, %v5 + %v8 = zext i16 %v7 to i32 + %v9 = and i32 %v8, 21845 + %v10 = lshr i32 %v8, 1 + %v11 = and i32 %v10, 21845 + %v12 = add nuw nsw i32 %v9, %v11 + %v13 = and i32 %v12, 13107 + %v14 = lshr i32 %v12, 2 + %v15 = and i32 %v14, 13107 + %v16 = add nuw nsw i32 %v13, %v15 + %v17 = and i32 %v16, 1799 + %v18 = lshr i32 %v16, 4 + %v19 = and i32 %v18, 1799 + %v20 = add nuw nsw i32 %v17, %v19 + %v21 = and i32 %v20, 15 + %v22 = lshr i32 %v20, 8 + %v23 = add nuw nsw i32 %v21, %v22 + %v24 = sub nsw i32 16, %v23 + ret i32 %v24 +} + +; unsigned ctlz32(unsigned t0) { +; t0 = t0 | (t0>>1); +; t0 = t0 | (t0>>2); +; t0 = t0 | (t0>>4); +; t0 = t0 | (t0>>8); +; t0 = t0 | (t0>>16); +; unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555); +; unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333); +; unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F); +; unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF); +; unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF); +; return 32-t5; +; } +; +; CHECK-LABEL: define i32 @ctlz32 +; CHECK: @llvm.ctlz.i32(i32 %a0, i1 false) +define i32 @ctlz32(i32 %a0) local_unnamed_addr #1 { +b0: + %v0 = lshr i32 %a0, 1 + %v1 = or i32 %v0, %a0 + %v2 = lshr i32 %v1, 2 + %v3 = or i32 %v1, %v2 + %v4 = lshr i32 %v3, 4 + %v5 = or i32 %v3, %v4 + %v6 = lshr i32 %v5, 8 + %v7 = or i32 %v5, %v6 + %v8 = lshr i32 %v7, 16 + %v9 = or i32 %v7, %v8 + %v10 = and i32 %v9, 1431655765 + %v11 = lshr i32 %v9, 1 + %v12 = and i32 %v11, 1431655765 + %v13 = add nuw i32 %v10, %v12 + %v14 = and i32 %v13, 858993459 + %v15 = lshr i32 %v13, 2 + %v16 = and i32 %v15, 858993459 + %v17 = add nuw nsw i32 %v14, %v16 + %v18 = and 
i32 %v17, 117901063 + %v19 = lshr i32 %v17, 4 + %v20 = and i32 %v19, 117901063 + %v21 = add nuw nsw i32 %v18, %v20 + %v22 = and i32 %v21, 983055 + %v23 = lshr i32 %v21, 8 + %v24 = and i32 %v23, 983055 + %v25 = add nuw nsw i32 %v22, %v24 + %v26 = and i32 %v25, 31 + %v27 = lshr i32 %v25, 16 + %v28 = add nuw nsw i32 %v26, %v27 + %v29 = sub nsw i32 32, %v28 + ret i32 %v29 +} + +; typedef unsigned long long u64_t; +; u64_t ctlz64(u64_t t0) { +; t0 = t0 | (t0>>1); +; t0 = t0 | (t0>>2); +; t0 = t0 | (t0>>4); +; t0 = t0 | (t0>>8); +; t0 = t0 | (t0>>16); +; t0 = t0 | (t0>>32); +; u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL); +; u64_t t2 = (t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL); +; u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL); +; u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL); +; u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL); +; u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL); +; return 64-t6; +; } +; +; CHECK-LABEL: define i64 @ctlz64 +; CHECK: @llvm.ctlz.i64(i64 %a0, i1 false) +define i64 @ctlz64(i64 %a0) local_unnamed_addr #1 { +b0: + %v0 = lshr i64 %a0, 1 + %v1 = or i64 %v0, %a0 + %v2 = lshr i64 %v1, 2 + %v3 = or i64 %v1, %v2 + %v4 = lshr i64 %v3, 4 + %v5 = or i64 %v3, %v4 + %v6 = lshr i64 %v5, 8 + %v7 = or i64 %v5, %v6 + %v8 = lshr i64 %v7, 16 + %v9 = or i64 %v7, %v8 + %v10 = lshr i64 %v9, 32 + %v11 = or i64 %v9, %v10 + %v12 = and i64 %v11, 6148914691236517205 + %v13 = lshr i64 %v11, 1 + %v14 = and i64 %v13, 6148914691236517205 + %v15 = add nuw i64 %v12, %v14 + %v16 = and i64 %v15, 3689348814741910323 + %v17 = lshr i64 %v15, 2 + %v18 = and i64 %v17, 3689348814741910323 + %v19 = add nuw nsw i64 %v16, %v18 + %v20 = and i64 %v19, 506381209866536711 + %v21 = lshr i64 %v19, 4 + %v22 = and i64 %v21, 506381209866536711 + %v23 = add nuw nsw i64 %v20, %v22 + %v24 = and i64 %v23, 4222189076152335 + %v25 = lshr i64 %v23, 8 + %v26 = and i64 %v25, 4222189076152335 + %v27 = add nuw nsw i64 %v24, %v26 + %v28 = and i64 %v27, 133143986207 + %v29 = lshr i64 %v27, 16 + %v30 = and i64 %v29, 133143986207 + %v31 = add nuw nsw i64 %v28, %v30 + %v32 = and i64 %v31, 63 + %v33 = lshr i64 %v31, 32 + %v34 = add nuw nsw i64 %v32, %v33 + %v35 = sub nsw i64 64, %v34 + ret i64 %v35 +} + +attributes #0 = { norecurse nounwind readnone uwtable } +attributes #1 = { nounwind uwtable } Index: test/Transforms/AggressiveInstCombine/ctpop-combine.ll =================================================================== --- /dev/null +++ test/Transforms/AggressiveInstCombine/ctpop-combine.ll @@ -0,0 +1,139 @@ +; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s + +; unsigned pop8(unsigned char t0) { +; unsigned t1 = (t0 & 0x55) + ((t0>>1) & 0x55); +; unsigned t2 = (t1 & 0x33) + ((t1>>2) & 0x33); +; unsigned t3 = (t2 & 0x0F) + ((t2>>4) & 0x0F); +; return t3; +; } +; +; CHECK-LABEL: define i32 @pop8 +; CHECK: [[ARG8:%[a-zA-Z0-9_]+]] = zext i8 %a0 to i32 +; CHECK: @llvm.ctpop.i32(i32 [[ARG8]]) +define i32 @pop8(i8 zeroext %a0) local_unnamed_addr #0 { +b0: + %v0 = zext i8 %a0 to i32 + %v1 = and i32 %v0, 85 + %v2 = lshr i32 %v0, 1 + %v3 = and i32 %v2, 85 + %v4 = add nuw nsw i32 %v1, %v3 + %v5 = and i32 %v4, 51 + %v6 = lshr i32 %v4, 2 + %v7 = and i32 %v6, 51 + %v8 = add nuw nsw i32 %v5, %v7 + %v9 = and i32 %v8, 7 + %v10 = lshr i32 %v8, 4 + %v11 = add nuw nsw i32 %v9, %v10 + ret i32 %v11 +} + +; unsigned pop16(unsigned short t0) { +; unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 
0x5555); +; unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333); +; unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F); +; unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF); +; return t4; +; } +; +; CHECK-LABEL: define i32 @pop16 +; CHECK: [[ARG16:%[a-zA-Z0-9_]+]] = zext i16 %a0 to i32 +; CHECK: @llvm.ctpop.i32(i32 [[ARG16]]) +define i32 @pop16(i16 zeroext %a0) local_unnamed_addr #1 { +b0: + %v0 = zext i16 %a0 to i32 + %v1 = and i32 %v0, 21845 + %v2 = lshr i32 %v0, 1 + %v3 = and i32 %v2, 21845 + %v4 = add nuw nsw i32 %v1, %v3 + %v5 = and i32 %v4, 13107 + %v6 = lshr i32 %v4, 2 + %v7 = and i32 %v6, 13107 + %v8 = add nuw nsw i32 %v5, %v7 + %v9 = and i32 %v8, 1799 + %v10 = lshr i32 %v8, 4 + %v11 = and i32 %v10, 1799 + %v12 = add nuw nsw i32 %v9, %v11 + %v13 = and i32 %v12, 15 + %v14 = lshr i32 %v12, 8 + %v15 = add nuw nsw i32 %v13, %v14 + ret i32 %v15 +} + +; unsigned pop32(unsigned t0) { +; unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555); +; unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333); +; unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F); +; unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF); +; unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF); +; return t5; +; } +; +; CHECK-LABEL: define i32 @pop32 +; CHECK: @llvm.ctpop.i32(i32 %a0) +define i32 @pop32(i32 %a0) local_unnamed_addr #1 { +b0: + %v0 = and i32 %a0, 1431655765 + %v1 = lshr i32 %a0, 1 + %v2 = and i32 %v1, 1431655765 + %v3 = add nuw i32 %v0, %v2 + %v4 = and i32 %v3, 858993459 + %v5 = lshr i32 %v3, 2 + %v6 = and i32 %v5, 858993459 + %v7 = add nuw nsw i32 %v4, %v6 + %v8 = and i32 %v7, 117901063 + %v9 = lshr i32 %v7, 4 + %v10 = and i32 %v9, 117901063 + %v11 = add nuw nsw i32 %v8, %v10 + %v12 = and i32 %v11, 983055 + %v13 = lshr i32 %v11, 8 + %v14 = and i32 %v13, 983055 + %v15 = add nuw nsw i32 %v12, %v14 + %v16 = and i32 %v15, 31 + %v17 = lshr i32 %v15, 16 + %v18 = add nuw nsw i32 %v16, %v17 + ret i32 %v18 +} + +; typedef unsigned long long u64_t; +; u64_t pop64(u64_t t0) { +; u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL); +; u64_t t2 = (t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL); +; u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL); +; u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL); +; u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL); +; u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL); +; return t6; +; } +; +; CHECK-LABEL: define i64 @pop64 +; CHECK: @llvm.ctpop.i64(i64 %a0) +define i64 @pop64(i64 %a0) local_unnamed_addr #1 { +b0: + %v0 = and i64 %a0, 6148914691236517205 + %v1 = lshr i64 %a0, 1 + %v2 = and i64 %v1, 6148914691236517205 + %v3 = add nuw i64 %v0, %v2 + %v4 = and i64 %v3, 3689348814741910323 + %v5 = lshr i64 %v3, 2 + %v6 = and i64 %v5, 3689348814741910323 + %v7 = add nuw nsw i64 %v4, %v6 + %v8 = and i64 %v7, 506381209866536711 + %v9 = lshr i64 %v7, 4 + %v10 = and i64 %v9, 506381209866536711 + %v11 = add nuw nsw i64 %v8, %v10 + %v12 = and i64 %v11, 4222189076152335 + %v13 = lshr i64 %v11, 8 + %v14 = and i64 %v13, 4222189076152335 + %v15 = add nuw nsw i64 %v12, %v14 + %v16 = and i64 %v15, 133143986207 + %v17 = lshr i64 %v15, 16 + %v18 = and i64 %v17, 133143986207 + %v19 = add nuw nsw i64 %v16, %v18 + %v20 = and i64 %v19, 63 + %v21 = lshr i64 %v19, 32 + %v22 = add nuw nsw i64 %v20, %v21 + ret i64 %v22 +} + +attributes #0 = { norecurse nounwind readnone } +attributes #1 = { nounwind uwtable }
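
For reference (not part of the patch): the post-optimized mask constants that
matchCtpopW expects can be reproduced with a small standalone C++ sketch that
mirrors the getMask lambda in BitCountCombine.cpp. The helper name maskFor and
the printing loop below are illustrative only; for a 16-bit popcount the
sketch yields 0xf, 0x707, 0x3333 and 0x5555, i.e. the constants 15, 1799,
13107 and 21845 that appear in the @pop16 test above.

  #include <cstdint>
  #include <cstdio>

  // For a step whose groups are S bits wide, keep the low log2(S) bits of
  // every S-bit group (the "demanded bits" form described next to getMask).
  static uint64_t maskFor(unsigned S, unsigned BW) {
    uint64_t T = 0;
    for (unsigned Pos = 0; Pos < BW; Pos += S)
      T |= uint64_t(S - 1) << Pos;
    return T;
  }

  int main() {
    // Walk the step sizes the same way matchCtpopW does: BW, BW/2, ..., 2.
    for (unsigned N = 16; N > 1; N /= 2)
      std::printf("shift %u: mask 0x%llx\n", N / 2,
                  (unsigned long long)maskFor(N, 16));
    return 0;
  }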