Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -71,6 +71,10 @@
   bool MadeIRChange = false;
+  // Handle bit counting patterns.
+  BitCountCombine BCC(TLI, DL);
+  MadeIRChange |= BCC.run(F);
+
   // Handle TruncInst patterns
   TruncInstCombine TIC(TLI, DL, DT);
   MadeIRChange |= TIC.run(F);
@@ -87,6 +91,10 @@
   auto &DL = F.getParent()->getDataLayout();
   bool MadeIRChange = false;
+  // Handle bit counting patterns.
+  BitCountCombine BCC(TLI, DL);
+  MadeIRChange |= BCC.run(F);
+
   // Handle TruncInst patterns
   TruncInstCombine TIC(TLI, DL, DT);
   MadeIRChange |= TIC.run(F);
Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -118,4 +118,28 @@
   /// \param SclTy scalar version of new type to reduce expression dag into.
   void ReduceExpressionDag(Type *SclTy);
 };
+
+//===----------------------------------------------------------------------===//
+// BitCountCombine - looks for code that computes the population count or the
+// count of leading zeros using the typical bit-twiddling algorithms from
+// Hacker's Delight.
+//===----------------------------------------------------------------------===//
+
+class BitCountCombine {
+public:
+  BitCountCombine(const TargetLibraryInfo &TLI, const DataLayout &DL)
+      : TLI(TLI), DL(DL) {}
+
+  bool run(Function &F);
+
+private:
+  Value *matchCtpopW(Instruction &In, unsigned BW);
+  Value *optimizeToCtpop(Instruction &In);
+  Value *optimizeToCtlz(Instruction &In);
+  bool runOnBlock(BasicBlock &B);
+
+  const TargetLibraryInfo &TLI;
+  const DataLayout &DL;
+};
+
 } // end namespace llvm.
Index: lib/Transforms/AggressiveInstCombine/BitCountCombine.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/AggressiveInstCombine/BitCountCombine.cpp
@@ -0,0 +1,264 @@
+//===- BitCountCombine.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file looks for code that calculates the population count or the count
+// of leading zeros using the bit-manipulation methods from Hacker's Delight.
+//
+// ctpop(n):
+//   bw = bitwidth(n)
+//   n = (n & 0x01010101..0101) + ((n & 0x10101010..1010) >> 1)
+//   n = (n & 0x00110011..0011) + ((n & 0x11001100..1100) >> 2)
+//   ...
+//   n = (n & 0x000000..111111) + ((n & 0x111111..000000) >> bw/2)
+//   return n
+//
+// ctlz(n):
+//   bw = bitwidth(n)
+//   n = n | (n >> 1)
+//   n = n | (n >> 2)
+//   n = n | (n >> 4)
+//   ...
+//   n = n | (n >> bw/2)
+//   return bw - ctpop(n)
+//
+//===----------------------------------------------------------------------===//
+
+#include "AggressiveInstCombineInternal.h"
+#include "llvm/Analysis/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+// Check if In matches a ctpop calculation pattern for a value of width BW
+// bits. If so, return the argument V such that ctpop(V) would be a candidate
+// for replacing In.
+// The matched pattern is:
+//   x0 := (V & 0x55..5)  + ((V>>1) & 0x55..5)
+//   x1 := (x0 & 0x33..3) + ((x0>>2) & 0x33..3)
+//   ...
+//   xn := (xn-1 & 0x00..0FF..F) + ((xn-1>>BW/2) & 0x00..0FF..F)
+// where xn is the candidate for ctpop(V).
+Value *BitCountCombine::matchCtpopW(Instruction &In, unsigned BW) {
+  auto matchStep = [] (Value *V, unsigned S, APInt M, bool ShiftAlone)
+        -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Add(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+
+    auto matchAndShift = [S,M,ShiftAlone] (Value *V0, Value *V1) -> Value* {
+      Value *V = nullptr;
+      const APInt *P = &M;
+      auto Mask = m_APInt(P);
+      auto Shift = m_SpecificInt(S);
+
+      if (!match(V0, m_And(m_Value(V), Mask)))
+        return nullptr;
+      if (ShiftAlone) {
+        if (!match(V1, m_LShr(m_Specific(V), Shift)))
+          return nullptr;
+      } else {
+        if (!match(V1, m_And(m_LShr(m_Specific(V), Shift), Mask)))
+          return nullptr;
+      }
+      return V;
+    };
+
+    if (Value *T = matchAndShift(Op0, Op1))
+      return T;
+    if (Value *T = matchAndShift(Op1, Op0))
+      return T;
+    return nullptr;
+  };
+
+  // Generate the bitmask for the & operation. BW is the bit-width of the
+  // entire mask. The masks are:
+  //   0b01010101..01010101   0x55..55   1 bit every 2 bits
+  //   0b00110011..00110011   0x33..33   2 bits every 4 bits
+  //   0b00000111..00000111   0x07..07   3 bits every 8 bits
+  //   ...                    ...        logS bits every S bits
+  // Normally the masks would be 01010101, 00110011, 00001111, i.e. the
+  // number of contiguous 1 bits in each group would be twice the number
+  // in the previous mask, but by the time this code runs, the "demanded"
+  // bits have been optimized to only require one more 1 bit in each
+  // subsequent mask. This function generates the post-optimized masks.
+  auto getMask = [] (unsigned S, unsigned BW) -> APInt {
+    assert(isPowerOf2_32(S));
+    APInt M(BW, S-1);
+    APInt T(BW, 0);
+    while (M != 0) {
+      T |= M;
+      M <<= S;
+    }
+    return T;
+  };
+
+  Value *V = &In;
+  bool SA = true;
+  unsigned N = BW;
+  while (N > 1) {
+    unsigned S = N/2;
+    V = matchStep(V, S, getMask(N, BW), SA);
+    if (!V)
+      return nullptr;
+    N = S;
+    SA = false;
+  }
+
+  return V;
+}
+
+// If In is an expression that evaluates popcnt via a shift/add pattern,
+// return the equivalent expression using the ctpop intrinsic. Otherwise
+// return nullptr.
+Value *BitCountCombine::optimizeToCtpop(Instruction &In) {
+  IntegerType *Ty = dyn_cast<IntegerType>(In.getType());
+  if (!Ty)
+    return nullptr;
+
+  // Take the first shift amount feeding the add, and assume this is the
+  // last shift in the popcnt computation.
+  Value *Op0 = nullptr, *Op1 = nullptr;
+  if (!match(&In, m_Add(m_Value(Op0), m_Value(Op1))))
+    return nullptr;
+
+  // Shift by half-width.
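+  // Illustrative sketch only: for a 32-bit popcount, the final add being
+  // looked for here is expected to have the shape
+  //   %res = add (and %t, C), (lshr %t, 16)
+  // (the mask constant C is not inspected at this point). The constant
+  // shift amount, 16 in this sketch, is then assumed to be half the bit
+  // width of the value whose bits are being counted.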
+  uint64_t SH = 0;
+  if (!match(Op0, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op1, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op0, m_LShr(m_Value(), m_ConstantInt(SH))) &&
+      !match(Op1, m_LShr(m_Value(), m_ConstantInt(SH))))
+    return nullptr;
+
+  if (SH < 4 || !isPowerOf2_64(SH))
+    return nullptr;
+
+  Value *V = matchCtpopW(In, 2*SH);
+  if (!V)
+    return nullptr;
+
+  unsigned TW = Ty->getBitWidth(), BW = 2*SH;
+  if (BW < TW) {
+    // BW is the bit width of the expression whose population count is
+    // being calculated. TW is the bit width of the type associated with
+    // that expression. Usually they are the same, but for ctpop8 the
+    // type may be "unsigned", i.e. 32-bit, while the ctpop8 would only
+    // consider the low 8 bits. In that case BW=8 and TW=32.
+    KnownBits K(TW);
+    computeKnownBits(V, K, DL);
+    APInt Need0 = APInt::getBitsSet(TW, BW, TW);
+    if ((K.Zero & Need0) != Need0)
+      return nullptr;
+  }
+
+  IRBuilder<> Builder(&In);
+  Module *M = In.getParent()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, {V->getType()});
+  CallInst *CI = Builder.CreateCall(Func, {V});
+  CI->setDebugLoc(In.getDebugLoc());
+  return CI;
+}
+
+Value *BitCountCombine::optimizeToCtlz(Instruction &In) {
+  // Let bw = bitwidth(n), and convert
+  //   n = n | (n>>1)
+  //   n = n | (n>>2)
+  //   n = n | (n>>4)
+  //   ...
+  //   n = n | (n>>bw/2)
+  //   bw - ctpop(n)
+  // to
+  //   ctlz(n).
+  // This code expects that the ctpop intrinsic has already been generated.
+
+  uint64_t BW = 0;
+  if (!match(&In, m_Sub(m_ConstantInt(BW), m_Intrinsic<Intrinsic::ctpop>())))
+    return nullptr;
+  // Get the argument of the ctpop.
+  Value *V = cast<CallInst>(In.getOperand(1))->getOperand(0);
+
+  // The argument to ctpop can be zero-extended in some cases. It is safe
+  // to ignore the zext.
+  if (auto *Z = dyn_cast<ZExtInst>(V))
+    V = Z->getOperand(0);
+
+  IntegerType *Ty = cast<IntegerType>(V->getType());
+  if (BW < Ty->getBitWidth())
+    return nullptr;
+
+  auto matchOrShift = [] (Value *V, unsigned S) -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Or(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+    if (match(Op0, m_LShr(m_Specific(Op1), m_SpecificInt(S))))
+      return Op1;
+    if (match(Op1, m_LShr(m_Specific(Op0), m_SpecificInt(S))))
+      return Op0;
+    return nullptr;
+  };
+
+  unsigned N = BW;
+  while (N > 1) {
+    N /= 2;
+    V = matchOrShift(V, N);
+    if (!V)
+      return nullptr;
+  }
+
+  // The value of BW is the one that determines the type of ctlz's argument.
+  IRBuilder<> Builder(&In);
+  if (BW > Ty->getBitWidth()) {
+    IntegerType *ATy = IntegerType::get(In.getContext(), BW);
+    V = Builder.CreateZExt(V, ATy);
+  }
+  Module *M = In.getParent()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, {V->getType()});
+  Value *False = ConstantInt::getFalse(In.getContext());
+  CallInst *CI = Builder.CreateCall(Func, {V, False});
+  CI->setDebugLoc(In.getDebugLoc());
+  if (In.getType() != CI->getType())
+    return Builder.CreateZExt(CI, In.getType());
+  return CI;
+}
+
+bool BitCountCombine::runOnBlock(BasicBlock &B) {
+  bool Changed = false;
+
+  // Iterate over the block for as long as new intrinsics are being generated.
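+  // Note: the block is scanned in reverse so that the final add of a
+  // bit-counting pattern is visited before the rest of the expression, and
+  // the scan restarts after every successful rewrite so that a newly
+  // created ctpop intrinsic can in turn be picked up by optimizeToCtlz.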
+ while (true) { + Value *Int = nullptr; + for (Instruction &In : reverse(B)) { + Int = optimizeToCtpop(In); + if (!Int) + Int = optimizeToCtlz(In); + if (Int) { + Changed = true; + In.replaceAllUsesWith(Int); + RecursivelyDeleteTriviallyDeadInstructions(&In, &TLI); + break; + } + } + if (!Int) + break; + } + + return Changed; +} + +bool BitCountCombine::run(Function &F) { + bool Changed = false; + for (BasicBlock &B : F) + Changed |= runOnBlock(B); + + return Changed; +} Index: lib/Transforms/AggressiveInstCombine/CMakeLists.txt =================================================================== --- lib/Transforms/AggressiveInstCombine/CMakeLists.txt +++ lib/Transforms/AggressiveInstCombine/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMAggressiveInstCombine AggressiveInstCombine.cpp + BitCountCombine.cpp TruncInstCombine.cpp ADDITIONAL_HEADER_DIRS Index: test/Transforms/AggressiveInstCombine/ctlz-combine.ll =================================================================== --- /dev/null +++ test/Transforms/AggressiveInstCombine/ctlz-combine.ll @@ -0,0 +1,160 @@ +; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s + +; unsigned ctlz16(unsigned short t0) { +; t0 = t0 | (t0>>1); +; t0 = t0 | (t0>>2); +; t0 = t0 | (t0>>4); +; t0 = t0 | (t0>>8); +; unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 0x5555); +; unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333); +; unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F); +; unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF); +; return 16-t4; +; } +; +; CHECK-LABEL: define i32 @ctlz16 +; CHECK: [[V0:%[a-zA-Z0-9_]+]] = call i16 @llvm.ctlz.i16(i16 %a0, i1 false) +; CHECK: zext i16 [[V0]] to i32 +define i32 @ctlz16(i16 zeroext %a0) local_unnamed_addr #0 { +b0: + %v0 = lshr i16 %a0, 1 + %v1 = or i16 %v0, %a0 + %v2 = lshr i16 %v1, 2 + %v3 = or i16 %v2, %v1 + %v4 = lshr i16 %v3, 4 + %v5 = or i16 %v4, %v3 + %v6 = lshr i16 %v5, 8 + %v7 = or i16 %v6, %v5 + %v8 = zext i16 %v7 to i32 + %v9 = and i32 %v8, 21845 + %v10 = lshr i32 %v8, 1 + %v11 = and i32 %v10, 21845 + %v12 = add nuw nsw i32 %v9, %v11 + %v13 = and i32 %v12, 13107 + %v14 = lshr i32 %v12, 2 + %v15 = and i32 %v14, 13107 + %v16 = add nuw nsw i32 %v13, %v15 + %v17 = and i32 %v16, 1799 + %v18 = lshr i32 %v16, 4 + %v19 = and i32 %v18, 1799 + %v20 = add nuw nsw i32 %v17, %v19 + %v21 = and i32 %v20, 15 + %v22 = lshr i32 %v20, 8 + %v23 = add nuw nsw i32 %v21, %v22 + %v24 = sub nsw i32 16, %v23 + ret i32 %v24 +} + +; unsigned ctlz32(unsigned t0) { +; t0 = t0 | (t0>>1); +; t0 = t0 | (t0>>2); +; t0 = t0 | (t0>>4); +; t0 = t0 | (t0>>8); +; t0 = t0 | (t0>>16); +; unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555); +; unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333); +; unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F); +; unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF); +; unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF); +; return 32-t5; +; } +; +; CHECK-LABEL: define i32 @ctlz32 +; CHECK: @llvm.ctlz.i32(i32 %a0, i1 false) +define i32 @ctlz32(i32 %a0) local_unnamed_addr #1 { +b0: + %v0 = lshr i32 %a0, 1 + %v1 = or i32 %v0, %a0 + %v2 = lshr i32 %v1, 2 + %v3 = or i32 %v1, %v2 + %v4 = lshr i32 %v3, 4 + %v5 = or i32 %v3, %v4 + %v6 = lshr i32 %v5, 8 + %v7 = or i32 %v5, %v6 + %v8 = lshr i32 %v7, 16 + %v9 = or i32 %v7, %v8 + %v10 = and i32 %v9, 1431655765 + %v11 = lshr i32 %v9, 1 + %v12 = and i32 %v11, 1431655765 + %v13 = add nuw i32 %v10, %v12 + %v14 = and i32 %v13, 858993459 + %v15 = lshr i32 %v13, 2 + %v16 = and i32 %v15, 858993459 + %v17 = add nuw nsw i32 %v14, %v16 + %v18 = and 
i32 %v17, 117901063 + %v19 = lshr i32 %v17, 4 + %v20 = and i32 %v19, 117901063 + %v21 = add nuw nsw i32 %v18, %v20 + %v22 = and i32 %v21, 983055 + %v23 = lshr i32 %v21, 8 + %v24 = and i32 %v23, 983055 + %v25 = add nuw nsw i32 %v22, %v24 + %v26 = and i32 %v25, 31 + %v27 = lshr i32 %v25, 16 + %v28 = add nuw nsw i32 %v26, %v27 + %v29 = sub nsw i32 32, %v28 + ret i32 %v29 +} + +; typedef unsigned long long u64_t; +; u64_t ctlz64(u64_t t0) { +; t0 = t0 | (t0>>1); +; t0 = t0 | (t0>>2); +; t0 = t0 | (t0>>4); +; t0 = t0 | (t0>>8); +; t0 = t0 | (t0>>16); +; t0 = t0 | (t0>>32); +; u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL); +; u64_t t2 = (t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL); +; u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL); +; u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL); +; u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL); +; u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL); +; return 64-t6; +; } +; +; CHECK-LABEL: define i64 @ctlz64 +; CHECK: @llvm.ctlz.i64(i64 %a0, i1 false) +define i64 @ctlz64(i64 %a0) local_unnamed_addr #1 { +b0: + %v0 = lshr i64 %a0, 1 + %v1 = or i64 %v0, %a0 + %v2 = lshr i64 %v1, 2 + %v3 = or i64 %v1, %v2 + %v4 = lshr i64 %v3, 4 + %v5 = or i64 %v3, %v4 + %v6 = lshr i64 %v5, 8 + %v7 = or i64 %v5, %v6 + %v8 = lshr i64 %v7, 16 + %v9 = or i64 %v7, %v8 + %v10 = lshr i64 %v9, 32 + %v11 = or i64 %v9, %v10 + %v12 = and i64 %v11, 6148914691236517205 + %v13 = lshr i64 %v11, 1 + %v14 = and i64 %v13, 6148914691236517205 + %v15 = add nuw i64 %v12, %v14 + %v16 = and i64 %v15, 3689348814741910323 + %v17 = lshr i64 %v15, 2 + %v18 = and i64 %v17, 3689348814741910323 + %v19 = add nuw nsw i64 %v16, %v18 + %v20 = and i64 %v19, 506381209866536711 + %v21 = lshr i64 %v19, 4 + %v22 = and i64 %v21, 506381209866536711 + %v23 = add nuw nsw i64 %v20, %v22 + %v24 = and i64 %v23, 4222189076152335 + %v25 = lshr i64 %v23, 8 + %v26 = and i64 %v25, 4222189076152335 + %v27 = add nuw nsw i64 %v24, %v26 + %v28 = and i64 %v27, 133143986207 + %v29 = lshr i64 %v27, 16 + %v30 = and i64 %v29, 133143986207 + %v31 = add nuw nsw i64 %v28, %v30 + %v32 = and i64 %v31, 63 + %v33 = lshr i64 %v31, 32 + %v34 = add nuw nsw i64 %v32, %v33 + %v35 = sub nsw i64 64, %v34 + ret i64 %v35 +} + +attributes #0 = { norecurse nounwind readnone uwtable } +attributes #1 = { nounwind uwtable } Index: test/Transforms/AggressiveInstCombine/ctpop-combine.ll =================================================================== --- /dev/null +++ test/Transforms/AggressiveInstCombine/ctpop-combine.ll @@ -0,0 +1,139 @@ +; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s + +; unsigned pop8(unsigned char t0) { +; unsigned t1 = (t0 & 0x55) + ((t0>>1) & 0x55); +; unsigned t2 = (t1 & 0x33) + ((t1>>2) & 0x33); +; unsigned t3 = (t2 & 0x0F) + ((t2>>4) & 0x0F); +; return t3; +; } +; +; CHECK-LABEL: define i32 @pop8 +; CHECK: [[ARG8:%[a-zA-Z0-9_]+]] = zext i8 %a0 to i32 +; CHECK: @llvm.ctpop.i32(i32 [[ARG8]]) +define i32 @pop8(i8 zeroext %a0) local_unnamed_addr #0 { +b0: + %v0 = zext i8 %a0 to i32 + %v1 = and i32 %v0, 85 + %v2 = lshr i32 %v0, 1 + %v3 = and i32 %v2, 85 + %v4 = add nuw nsw i32 %v1, %v3 + %v5 = and i32 %v4, 51 + %v6 = lshr i32 %v4, 2 + %v7 = and i32 %v6, 51 + %v8 = add nuw nsw i32 %v5, %v7 + %v9 = and i32 %v8, 7 + %v10 = lshr i32 %v8, 4 + %v11 = add nuw nsw i32 %v9, %v10 + ret i32 %v11 +} + +; unsigned pop16(unsigned short t0) { +; unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 
0x5555); +; unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333); +; unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F); +; unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF); +; return t4; +; } +; +; CHECK-LABEL: define i32 @pop16 +; CHECK: [[ARG16:%[a-zA-Z0-9_]+]] = zext i16 %a0 to i32 +; CHECK: @llvm.ctpop.i32(i32 [[ARG16]]) +define i32 @pop16(i16 zeroext %a0) local_unnamed_addr #1 { +b0: + %v0 = zext i16 %a0 to i32 + %v1 = and i32 %v0, 21845 + %v2 = lshr i32 %v0, 1 + %v3 = and i32 %v2, 21845 + %v4 = add nuw nsw i32 %v1, %v3 + %v5 = and i32 %v4, 13107 + %v6 = lshr i32 %v4, 2 + %v7 = and i32 %v6, 13107 + %v8 = add nuw nsw i32 %v5, %v7 + %v9 = and i32 %v8, 1799 + %v10 = lshr i32 %v8, 4 + %v11 = and i32 %v10, 1799 + %v12 = add nuw nsw i32 %v9, %v11 + %v13 = and i32 %v12, 15 + %v14 = lshr i32 %v12, 8 + %v15 = add nuw nsw i32 %v13, %v14 + ret i32 %v15 +} + +; unsigned pop32(unsigned t0) { +; unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555); +; unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333); +; unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F); +; unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF); +; unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF); +; return t5; +; } +; +; CHECK-LABEL: define i32 @pop32 +; CHECK: @llvm.ctpop.i32(i32 %a0) +define i32 @pop32(i32 %a0) local_unnamed_addr #1 { +b0: + %v0 = and i32 %a0, 1431655765 + %v1 = lshr i32 %a0, 1 + %v2 = and i32 %v1, 1431655765 + %v3 = add nuw i32 %v0, %v2 + %v4 = and i32 %v3, 858993459 + %v5 = lshr i32 %v3, 2 + %v6 = and i32 %v5, 858993459 + %v7 = add nuw nsw i32 %v4, %v6 + %v8 = and i32 %v7, 117901063 + %v9 = lshr i32 %v7, 4 + %v10 = and i32 %v9, 117901063 + %v11 = add nuw nsw i32 %v8, %v10 + %v12 = and i32 %v11, 983055 + %v13 = lshr i32 %v11, 8 + %v14 = and i32 %v13, 983055 + %v15 = add nuw nsw i32 %v12, %v14 + %v16 = and i32 %v15, 31 + %v17 = lshr i32 %v15, 16 + %v18 = add nuw nsw i32 %v16, %v17 + ret i32 %v18 +} + +; typedef unsigned long long u64_t; +; u64_t pop64(u64_t t0) { +; u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL); +; u64_t t2 = (t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL); +; u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL); +; u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL); +; u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL); +; u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL); +; return t6; +; } +; +; CHECK-LABEL: define i64 @pop64 +; CHECK: @llvm.ctpop.i64(i64 %a0) +define i64 @pop64(i64 %a0) local_unnamed_addr #1 { +b0: + %v0 = and i64 %a0, 6148914691236517205 + %v1 = lshr i64 %a0, 1 + %v2 = and i64 %v1, 6148914691236517205 + %v3 = add nuw i64 %v0, %v2 + %v4 = and i64 %v3, 3689348814741910323 + %v5 = lshr i64 %v3, 2 + %v6 = and i64 %v5, 3689348814741910323 + %v7 = add nuw nsw i64 %v4, %v6 + %v8 = and i64 %v7, 506381209866536711 + %v9 = lshr i64 %v7, 4 + %v10 = and i64 %v9, 506381209866536711 + %v11 = add nuw nsw i64 %v8, %v10 + %v12 = and i64 %v11, 4222189076152335 + %v13 = lshr i64 %v11, 8 + %v14 = and i64 %v13, 4222189076152335 + %v15 = add nuw nsw i64 %v12, %v14 + %v16 = and i64 %v15, 133143986207 + %v17 = lshr i64 %v15, 16 + %v18 = and i64 %v17, 133143986207 + %v19 = add nuw nsw i64 %v16, %v18 + %v20 = and i64 %v19, 63 + %v21 = lshr i64 %v19, 32 + %v22 = add nuw nsw i64 %v20, %v21 + ret i64 %v22 +} + +attributes #0 = { norecurse nounwind readnone } +attributes #1 = { nounwind uwtable }
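
For reference (not part of the patch): the post-optimized mask constants that
matchCtpopW expects can be reproduced with a small standalone C++ sketch that
mirrors the getMask lambda in BitCountCombine.cpp. The helper name maskFor and
the printing loop below are illustrative only; for a 16-bit popcount the
sketch yields 0xf, 0x707, 0x3333 and 0x5555, i.e. the constants 15, 1799,
13107 and 21845 that appear in the @pop16 test above.

  #include <cstdint>
  #include <cstdio>

  // For a step whose groups are S bits wide, keep the low log2(S) bits of
  // every S-bit group (the "demanded bits" form described next to getMask).
  static uint64_t maskFor(unsigned S, unsigned BW) {
    uint64_t T = 0;
    for (unsigned Pos = 0; Pos < BW; Pos += S)
      T |= uint64_t(S - 1) << Pos;
    return T;
  }

  int main() {
    // Walk the step sizes the same way matchCtpopW does: BW, BW/2, ..., 2.
    for (unsigned N = 16; N > 1; N /= 2)
      std::printf("shift %u: mask 0x%llx\n", N / 2,
                  (unsigned long long)maskFor(N, 16));
    return 0;
  }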