Index: lib/Transforms/InstCombine/InstCombineAddSub.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1047,10 +1047,140 @@
   return nullptr;
 }
 
+// Check if BI matches a ctpop calculation pattern for a value of width BW
+// bits. If so, return the argument V such that ctpop(V) would be a candidate
+// for replacing BI. If BW is less than the bit-width of the type of V, then
+// BI == ctpop(V) if the bits of V beyond BW are zero.
+// The matched pattern is:
+//   x0 := (V & 0x55..5)  + ((V>>1) & 0x55..5)
+//   x1 := (x0 & 0x33..3) + ((x0>>2) & 0x33..3)
+//   ...
+//   xn := (xn-1 & 0x00..0FF..F) + ((xn-1>>BW/2) & 0x00..0FF..F)
+// where xn is the candidate for ctpop(V).
+static Value *SimplifyToCtpopW(BinaryOperator *BI, unsigned BW) {
+  auto matchStep = [] (Value *V, unsigned S, APInt M, bool ShiftAlone)
+      -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Add(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+
+    auto matchAndShift = [S,M,ShiftAlone] (Value *V0, Value *V1) -> Value* {
+      Value *V = nullptr;
+      const APInt *P = nullptr;
+      auto Mask = m_APInt(P);
+      auto Shift = m_SpecificInt(S);
+      // m_APInt only binds the matched constant to P; verify explicitly
+      // that the bound value is the expected mask M. The constant may be
+      // wider than M (e.g. an i32 mask constant for an 8-bit popcount),
+      // so extend M to the constant's width before comparing.
+      auto isExpectedMask = [&M] (const APInt *C) -> bool {
+        return C && C->getBitWidth() >= M.getBitWidth() &&
+               *C == M.zextOrSelf(C->getBitWidth());
+      };
+
+      if (!match(V0, m_And(m_Value(V), Mask)) || !isExpectedMask(P))
+        return nullptr;
+      if (ShiftAlone) {
+        if (!match(V1, m_LShr(m_Specific(V), Shift)))
+          return nullptr;
+      } else {
+        if (!match(V1, m_And(m_LShr(m_Specific(V), Shift), Mask)) ||
+            !isExpectedMask(P))
+          return nullptr;
+      }
+      return V;
+    };
+
+    if (Value *T = matchAndShift(Op0, Op1))
+      return T;
+    if (Value *T = matchAndShift(Op1, Op0))
+      return T;
+    return nullptr;
+  };
+
+  // Generate the bitmask for the & operation. BW is the bit-width of the
+  // entire mask. The masks are:
+  //   0b01010101..01010101  0x55..55  1 bit every 2 bits
+  //   0b00110011..00110011  0x33..33  2 bits every 4 bits
+  //   0b00000111..00000111  0x07..07  3 bits every 8 bits
+  //   ...                             log2(S) bits every S bits
+  // Normally the masks would be 01010101, 00110011, 00001111, i.e. the
+  // number of contiguous 1 bits in each group would be twice the number
+  // in the previous mask, but by the time this code runs, the "demanded"
+  // bits have been optimized to only require one more 1 bit in each
+  // subsequent mask. This function generates the post-optimized masks.
+  auto getMask = [] (unsigned S, unsigned BW) -> APInt {
+    assert(isPowerOf2_32(S));
+    APInt M(BW, S-1);
+    APInt T(BW, 0);
+    while (M != 0) {
+      T |= M;
+      M <<= S;
+    }
+    return T;
+  };
+
+  Value *V = BI;
+  bool SA = true;
+  unsigned N = BW;
+  while (N > 1) {
+    unsigned S = N/2;
+    V = matchStep(V, S, getMask(N, BW), SA);
+    if (!V)
+      return nullptr;
+    N = S;
+    SA = false;
+  }
+
+  return V;
+}
+
+static Value *SimplifyToCtpop(BinaryOperator *BI,
+                              InstCombiner::BuilderTy *Builder) {
+  IntegerType *Ty = dyn_cast<IntegerType>(BI->getType());
+  if (!Ty)
+    return nullptr;
+
+  // Take the first shift amount feeding the add, and assume this is the
+  // last shift in the popcount computation.
+  Value *Op0 = nullptr, *Op1 = nullptr;
+  if (!match(BI, m_Add(m_Value(Op0), m_Value(Op1))))
+    return nullptr;
+
+  // Shift by half-width.
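+  // For example, for the 32-bit popcount in ctpop-match.ll the final step is
+  //   t5 = (t4 & 0x0000FFFF) + ((t4 >> 16) & 0x0000FFFF)
+  // (the demanded-bits simplification may already have dropped the mask on
+  // the shifted half), so the last shift amount is 16 and the candidate
+  // width is 2*16 = 32 bits.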
+  uint64_t SH = 0;
+  if (!match(Op0, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op1, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op0, m_LShr(m_Value(), m_ConstantInt(SH))) &&
+      !match(Op1, m_LShr(m_Value(), m_ConstantInt(SH))))
+    return nullptr;
+
+  if (SH < 4 || !isPowerOf2_64(SH))
+    return nullptr;
+
+  Value *V = SimplifyToCtpopW(BI, 2*SH);
+  if (!V)
+    return nullptr;
+
+  Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+  unsigned TW = Ty->getBitWidth(), BW = 2*SH;
+  if (BW < TW) {
+    // BW is the bit width of the expression whose population count is
+    // being calculated. TW is the bit width of the type associated with
+    // that expression. Usually they are the same, but for an 8-bit
+    // popcount the type may be "unsigned", i.e. 32-bit, while only the
+    // low 8 bits are counted. In that case BW=8 and TW=32, and the
+    // replacement is only valid if the bits of V above BW are known to
+    // be zero.
+    APInt K0(TW, 0), K1(TW, 0);
+    computeKnownBits(V, K0, K1, M->getDataLayout());
+    APInt Need0 = APInt::getBitsSet(TW, BW, TW);
+    if ((K0 & Need0) != Need0)
+      return nullptr;
+  }
+
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, {V->getType()});
+  CallInst *CI = Builder->CreateCall(Func, {V});
+  CI->setDebugLoc(BI->getDebugLoc());
+  return CI;
+}
+
 Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
   bool Changed = SimplifyAssociativeOrCommutative(I);
   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
 
+  if (Value *V = SimplifyToCtpop(&I, Builder))
+    return ReplaceInstUsesWith(I, V);
+
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
@@ -1489,9 +1619,75 @@
   return Builder->CreateIntCast(Result, Ty, true);
 }
 
+static Value *SimplifyToCtlz(BinaryOperator *BI,
+                             InstCombiner::BuilderTy *Builder) {
+  // Let bw = bitwidth(n), and convert
+  //   n = n | (n>>1)
+  //   n = n | (n>>2)
+  //   n = n | (n>>4)
+  //   ...
+  //   n = n | (n>>bw/2)
+  //   bw - ctpop(n)
+  // to
+  //   ctlz(n).
+  // This code expects that the ctpop intrinsic has already been generated,
+  // e.g. by SimplifyToCtpop above.
+
+  uint64_t BW = 0;
+  if (!match(BI, m_Sub(m_ConstantInt(BW), m_Intrinsic<Intrinsic::ctpop>())))
+    return nullptr;
+  // Get the argument of the ctpop.
+  Value *V = cast<IntrinsicInst>(BI->getOperand(1))->getArgOperand(0);
+
+  // The argument to ctpop can be zero-extended in some cases. It is safe
+  // to look through the zext, since zero extension does not change the
+  // population count.
+  if (auto *Z = dyn_cast<ZExtInst>(V))
+    V = Z->getOperand(0);
+
+  IntegerType *Ty = cast<IntegerType>(V->getType());
+  if (BW < Ty->getBitWidth())
+    return nullptr;
+
+  auto matchOrShift = [] (Value *V, unsigned S) -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Or(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+    if (match(Op0, m_LShr(m_Specific(Op1), m_SpecificInt(S))))
+      return Op1;
+    if (match(Op1, m_LShr(m_Specific(Op0), m_SpecificInt(S))))
+      return Op0;
+    return nullptr;
+  };
+
+  unsigned N = BW;
+  while (N > 1) {
+    N /= 2;
+    V = matchOrShift(V, N);
+    if (!V)
+      return nullptr;
+  }
+
+  // The value of BW, not the current width of V, determines the type of
+  // ctlz's argument.
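+  // For example, for the ctlz16 pattern in ctlz-match.ll the subtraction
+  // computes "16 - t4", so BW is 16: V is widened to an iBW value if it is
+  // narrower, ctlz is created at that width, and the result is zero-extended
+  // back to the type of the subtraction below if necessary.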
+ if (BW > Ty->getBitWidth()) { + IntegerType *ATy = IntegerType::get(BI->getContext(), BW); + V = Builder->CreateZExt(V, ATy); + } + Module *M = Builder->GetInsertBlock()->getParent()->getParent(); + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, {V->getType()}); + Value *False = ConstantInt::getFalse(BI->getContext()); + CallInst *CI = Builder->CreateCall(Func, {V, False}); + CI->setDebugLoc(BI->getDebugLoc()); + if (BI->getType() != CI->getType()) + return Builder->CreateZExt(CI, BI->getType()); + return CI; +} + Instruction *InstCombiner::visitSub(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyToCtlz(&I, Builder)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyVectorOp(I)) return ReplaceInstUsesWith(I, V); Index: test/Transforms/InstCombine/ctlz-match.ll =================================================================== --- /dev/null +++ test/Transforms/InstCombine/ctlz-match.ll @@ -0,0 +1,290 @@ +; RUN: opt -early-cse -instcombine -S < %s | FileCheck %s + +; unsigned ctlz16(unsigned short t0) { +; t0 = t0 | (t0>>1); +; t0 = t0 | (t0>>2); +; t0 = t0 | (t0>>4); +; t0 = t0 | (t0>>8); +; unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 0x5555); +; unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333); +; unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F); +; unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF); +; return 16-t4; +; } +; +; CHECK: define i32 @ctlz16 +; CHECK: ctlz +define i32 @ctlz16(i16 zeroext %t0) #0 { +entry: + %t0.addr = alloca i16, align 2 + %t1 = alloca i32, align 4 + %t2 = alloca i32, align 4 + %t3 = alloca i32, align 4 + %t4 = alloca i32, align 4 + store i16 %t0, i16* %t0.addr, align 2 + %0 = load i16, i16* %t0.addr, align 2 + %conv = zext i16 %0 to i32 + %1 = load i16, i16* %t0.addr, align 2 + %conv1 = zext i16 %1 to i32 + %shr = ashr i32 %conv1, 1 + %or = or i32 %conv, %shr + %conv2 = trunc i32 %or to i16 + store i16 %conv2, i16* %t0.addr, align 2 + %2 = load i16, i16* %t0.addr, align 2 + %conv3 = zext i16 %2 to i32 + %3 = load i16, i16* %t0.addr, align 2 + %conv4 = zext i16 %3 to i32 + %shr5 = ashr i32 %conv4, 2 + %or6 = or i32 %conv3, %shr5 + %conv7 = trunc i32 %or6 to i16 + store i16 %conv7, i16* %t0.addr, align 2 + %4 = load i16, i16* %t0.addr, align 2 + %conv8 = zext i16 %4 to i32 + %5 = load i16, i16* %t0.addr, align 2 + %conv9 = zext i16 %5 to i32 + %shr10 = ashr i32 %conv9, 4 + %or11 = or i32 %conv8, %shr10 + %conv12 = trunc i32 %or11 to i16 + store i16 %conv12, i16* %t0.addr, align 2 + %6 = load i16, i16* %t0.addr, align 2 + %conv13 = zext i16 %6 to i32 + %7 = load i16, i16* %t0.addr, align 2 + %conv14 = zext i16 %7 to i32 + %shr15 = ashr i32 %conv14, 8 + %or16 = or i32 %conv13, %shr15 + %conv17 = trunc i32 %or16 to i16 + store i16 %conv17, i16* %t0.addr, align 2 + %8 = load i16, i16* %t0.addr, align 2 + %conv18 = zext i16 %8 to i32 + %and = and i32 %conv18, 21845 + %9 = load i16, i16* %t0.addr, align 2 + %conv19 = zext i16 %9 to i32 + %shr20 = ashr i32 %conv19, 1 + %and21 = and i32 %shr20, 21845 + %add = add nsw i32 %and, %and21 + store i32 %add, i32* %t1, align 4 + %10 = load i32, i32* %t1, align 4 + %and22 = and i32 %10, 13107 + %11 = load i32, i32* %t1, align 4 + %shr23 = lshr i32 %11, 2 + %and24 = and i32 %shr23, 13107 + %add25 = add i32 %and22, %and24 + store i32 %add25, i32* %t2, align 4 + %12 = load i32, i32* %t2, align 4 + %and26 = and i32 %12, 3855 + %13 = load i32, i32* %t2, align 4 + %shr27 = lshr i32 %13, 4 + %and28 = and i32 %shr27, 3855 + %add29 = add i32 %and26, %and28 + store 
i32 %add29, i32* %t3, align 4 + %14 = load i32, i32* %t3, align 4 + %and30 = and i32 %14, 255 + %15 = load i32, i32* %t3, align 4 + %shr31 = lshr i32 %15, 8 + %and32 = and i32 %shr31, 255 + %add33 = add i32 %and30, %and32 + store i32 %add33, i32* %t4, align 4 + %16 = load i32, i32* %t4, align 4 + %sub = sub i32 16, %16 + ret i32 %sub +} + + +; unsigned ctlz32(unsigned t0) { +; t0 = t0 | (t0>>1); +; t0 = t0 | (t0>>2); +; t0 = t0 | (t0>>4); +; t0 = t0 | (t0>>8); +; t0 = t0 | (t0>>16); +; unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555); +; unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333); +; unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F); +; unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF); +; unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF); +; return 32-t5; +; } +; +; CHECK: define i32 @ctlz32 +; CHECK: ctlz +define i32 @ctlz32(i32 %t0) #0 { +entry: + %t0.addr = alloca i32, align 4 + %t1 = alloca i32, align 4 + %t2 = alloca i32, align 4 + %t3 = alloca i32, align 4 + %t4 = alloca i32, align 4 + %t5 = alloca i32, align 4 + store i32 %t0, i32* %t0.addr, align 4 + %0 = load i32, i32* %t0.addr, align 4 + %1 = load i32, i32* %t0.addr, align 4 + %shr = lshr i32 %1, 1 + %or = or i32 %0, %shr + store i32 %or, i32* %t0.addr, align 4 + %2 = load i32, i32* %t0.addr, align 4 + %3 = load i32, i32* %t0.addr, align 4 + %shr1 = lshr i32 %3, 2 + %or2 = or i32 %2, %shr1 + store i32 %or2, i32* %t0.addr, align 4 + %4 = load i32, i32* %t0.addr, align 4 + %5 = load i32, i32* %t0.addr, align 4 + %shr3 = lshr i32 %5, 4 + %or4 = or i32 %4, %shr3 + store i32 %or4, i32* %t0.addr, align 4 + %6 = load i32, i32* %t0.addr, align 4 + %7 = load i32, i32* %t0.addr, align 4 + %shr5 = lshr i32 %7, 8 + %or6 = or i32 %6, %shr5 + store i32 %or6, i32* %t0.addr, align 4 + %8 = load i32, i32* %t0.addr, align 4 + %9 = load i32, i32* %t0.addr, align 4 + %shr7 = lshr i32 %9, 16 + %or8 = or i32 %8, %shr7 + store i32 %or8, i32* %t0.addr, align 4 + %10 = load i32, i32* %t0.addr, align 4 + %and = and i32 %10, 1431655765 + %11 = load i32, i32* %t0.addr, align 4 + %shr9 = lshr i32 %11, 1 + %and10 = and i32 %shr9, 1431655765 + %add = add i32 %and, %and10 + store i32 %add, i32* %t1, align 4 + %12 = load i32, i32* %t1, align 4 + %and11 = and i32 %12, 858993459 + %13 = load i32, i32* %t1, align 4 + %shr12 = lshr i32 %13, 2 + %and13 = and i32 %shr12, 858993459 + %add14 = add i32 %and11, %and13 + store i32 %add14, i32* %t2, align 4 + %14 = load i32, i32* %t2, align 4 + %and15 = and i32 %14, 252645135 + %15 = load i32, i32* %t2, align 4 + %shr16 = lshr i32 %15, 4 + %and17 = and i32 %shr16, 252645135 + %add18 = add i32 %and15, %and17 + store i32 %add18, i32* %t3, align 4 + %16 = load i32, i32* %t3, align 4 + %and19 = and i32 %16, 16711935 + %17 = load i32, i32* %t3, align 4 + %shr20 = lshr i32 %17, 8 + %and21 = and i32 %shr20, 16711935 + %add22 = add i32 %and19, %and21 + store i32 %add22, i32* %t4, align 4 + %18 = load i32, i32* %t4, align 4 + %and23 = and i32 %18, 65535 + %19 = load i32, i32* %t4, align 4 + %shr24 = lshr i32 %19, 16 + %and25 = and i32 %shr24, 65535 + %add26 = add i32 %and23, %and25 + store i32 %add26, i32* %t5, align 4 + %20 = load i32, i32* %t5, align 4 + %sub = sub i32 32, %20 + ret i32 %sub +} + + +; typedef unsigned long long u64_t; +; u64_t ctlz64(u64_t t0) { +; t0 = t0 | (t0>>1); +; t0 = t0 | (t0>>2); +; t0 = t0 | (t0>>4); +; t0 = t0 | (t0>>8); +; t0 = t0 | (t0>>16); +; t0 = t0 | (t0>>32); +; u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL); +; u64_t t2 = 
(t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL); +; u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL); +; u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL); +; u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL); +; u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL); +; return 64-t6; +; } +; +; CHECK: define i64 @ctlz64 +; CHECK: ctlz +define i64 @ctlz64(i64 %t0) #0 { +entry: + %t0.addr = alloca i64, align 8 + %t1 = alloca i64, align 8 + %t2 = alloca i64, align 8 + %t3 = alloca i64, align 8 + %t4 = alloca i64, align 8 + %t5 = alloca i64, align 8 + %t6 = alloca i64, align 8 + store i64 %t0, i64* %t0.addr, align 8 + %0 = load i64, i64* %t0.addr, align 8 + %1 = load i64, i64* %t0.addr, align 8 + %shr = lshr i64 %1, 1 + %or = or i64 %0, %shr + store i64 %or, i64* %t0.addr, align 8 + %2 = load i64, i64* %t0.addr, align 8 + %3 = load i64, i64* %t0.addr, align 8 + %shr1 = lshr i64 %3, 2 + %or2 = or i64 %2, %shr1 + store i64 %or2, i64* %t0.addr, align 8 + %4 = load i64, i64* %t0.addr, align 8 + %5 = load i64, i64* %t0.addr, align 8 + %shr3 = lshr i64 %5, 4 + %or4 = or i64 %4, %shr3 + store i64 %or4, i64* %t0.addr, align 8 + %6 = load i64, i64* %t0.addr, align 8 + %7 = load i64, i64* %t0.addr, align 8 + %shr5 = lshr i64 %7, 8 + %or6 = or i64 %6, %shr5 + store i64 %or6, i64* %t0.addr, align 8 + %8 = load i64, i64* %t0.addr, align 8 + %9 = load i64, i64* %t0.addr, align 8 + %shr7 = lshr i64 %9, 16 + %or8 = or i64 %8, %shr7 + store i64 %or8, i64* %t0.addr, align 8 + %10 = load i64, i64* %t0.addr, align 8 + %11 = load i64, i64* %t0.addr, align 8 + %shr9 = lshr i64 %11, 32 + %or10 = or i64 %10, %shr9 + store i64 %or10, i64* %t0.addr, align 8 + %12 = load i64, i64* %t0.addr, align 8 + %and = and i64 %12, 6148914691236517205 + %13 = load i64, i64* %t0.addr, align 8 + %shr11 = lshr i64 %13, 1 + %and12 = and i64 %shr11, 6148914691236517205 + %add = add i64 %and, %and12 + store i64 %add, i64* %t1, align 8 + %14 = load i64, i64* %t1, align 8 + %and13 = and i64 %14, 3689348814741910323 + %15 = load i64, i64* %t1, align 8 + %shr14 = lshr i64 %15, 2 + %and15 = and i64 %shr14, 3689348814741910323 + %add16 = add i64 %and13, %and15 + store i64 %add16, i64* %t2, align 8 + %16 = load i64, i64* %t2, align 8 + %and17 = and i64 %16, 1085102592571150095 + %17 = load i64, i64* %t2, align 8 + %shr18 = lshr i64 %17, 4 + %and19 = and i64 %shr18, 1085102592571150095 + %add20 = add i64 %and17, %and19 + store i64 %add20, i64* %t3, align 8 + %18 = load i64, i64* %t3, align 8 + %and21 = and i64 %18, 71777214294589695 + %19 = load i64, i64* %t3, align 8 + %shr22 = lshr i64 %19, 8 + %and23 = and i64 %shr22, 71777214294589695 + %add24 = add i64 %and21, %and23 + store i64 %add24, i64* %t4, align 8 + %20 = load i64, i64* %t4, align 8 + %and25 = and i64 %20, 281470681808895 + %21 = load i64, i64* %t4, align 8 + %shr26 = lshr i64 %21, 16 + %and27 = and i64 %shr26, 281470681808895 + %add28 = add i64 %and25, %and27 + store i64 %add28, i64* %t5, align 8 + %22 = load i64, i64* %t5, align 8 + %and29 = and i64 %22, 4294967295 + %23 = load i64, i64* %t5, align 8 + %shr30 = lshr i64 %23, 32 + %and31 = and i64 %shr30, 4294967295 + %add32 = add i64 %and29, %and31 + store i64 %add32, i64* %t6, align 8 + %24 = load i64, i64* %t6, align 8 + %sub = sub i64 64, %24 + ret i64 %sub +} + +attributes #0 = { nounwind } Index: test/Transforms/InstCombine/ctpop-match.ll =================================================================== --- /dev/null 
+++ test/Transforms/InstCombine/ctpop-match.ll @@ -0,0 +1,229 @@ +; RUN: opt -early-cse -instcombine -S < %s | FileCheck %s + +; unsigned pop8(unsigned char t0) { +; unsigned t1 = (t0 & 0x55) + ((t0>>1) & 0x55); +; unsigned t2 = (t1 & 0x33) + ((t1>>2) & 0x33); +; unsigned t3 = (t2 & 0x0F) + ((t2>>4) & 0x0F); +; return t3; +; } +; +; CHECK: define i32 @pop8 +; CHECK: ctpop +define i32 @pop8(i8 zeroext %t0) #0 { +entry: + %t0.addr = alloca i8, align 1 + %t1 = alloca i32, align 4 + %t2 = alloca i32, align 4 + %t3 = alloca i32, align 4 + store i8 %t0, i8* %t0.addr, align 1 + %0 = load i8, i8* %t0.addr, align 1 + %conv = zext i8 %0 to i32 + %and = and i32 %conv, 85 + %1 = load i8, i8* %t0.addr, align 1 + %conv1 = zext i8 %1 to i32 + %shr = ashr i32 %conv1, 1 + %and2 = and i32 %shr, 85 + %add = add nsw i32 %and, %and2 + store i32 %add, i32* %t1, align 4 + %2 = load i32, i32* %t1, align 4 + %and3 = and i32 %2, 51 + %3 = load i32, i32* %t1, align 4 + %shr4 = lshr i32 %3, 2 + %and5 = and i32 %shr4, 51 + %add6 = add i32 %and3, %and5 + store i32 %add6, i32* %t2, align 4 + %4 = load i32, i32* %t2, align 4 + %and7 = and i32 %4, 15 + %5 = load i32, i32* %t2, align 4 + %shr8 = lshr i32 %5, 4 + %and9 = and i32 %shr8, 15 + %add10 = add i32 %and7, %and9 + store i32 %add10, i32* %t3, align 4 + %6 = load i32, i32* %t3, align 4 + ret i32 %6 +} + + +; unsigned pop16(unsigned short t0) { +; unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 0x5555); +; unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333); +; unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F); +; unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF); +; return t4; +; } +; +; CHECK: define i32 @pop16 +; CHECK: ctpop +define i32 @pop16(i16 zeroext %t0) #0 { +entry: + %t0.addr = alloca i16, align 2 + %t1 = alloca i32, align 4 + %t2 = alloca i32, align 4 + %t3 = alloca i32, align 4 + %t4 = alloca i32, align 4 + store i16 %t0, i16* %t0.addr, align 2 + %0 = load i16, i16* %t0.addr, align 2 + %conv = zext i16 %0 to i32 + %and = and i32 %conv, 21845 + %1 = load i16, i16* %t0.addr, align 2 + %conv1 = zext i16 %1 to i32 + %shr = ashr i32 %conv1, 1 + %and2 = and i32 %shr, 21845 + %add = add nsw i32 %and, %and2 + store i32 %add, i32* %t1, align 4 + %2 = load i32, i32* %t1, align 4 + %and3 = and i32 %2, 13107 + %3 = load i32, i32* %t1, align 4 + %shr4 = lshr i32 %3, 2 + %and5 = and i32 %shr4, 13107 + %add6 = add i32 %and3, %and5 + store i32 %add6, i32* %t2, align 4 + %4 = load i32, i32* %t2, align 4 + %and7 = and i32 %4, 3855 + %5 = load i32, i32* %t2, align 4 + %shr8 = lshr i32 %5, 4 + %and9 = and i32 %shr8, 3855 + %add10 = add i32 %and7, %and9 + store i32 %add10, i32* %t3, align 4 + %6 = load i32, i32* %t3, align 4 + %and11 = and i32 %6, 255 + %7 = load i32, i32* %t3, align 4 + %shr12 = lshr i32 %7, 8 + %and13 = and i32 %shr12, 255 + %add14 = add i32 %and11, %and13 + store i32 %add14, i32* %t4, align 4 + %8 = load i32, i32* %t4, align 4 + ret i32 %8 +} + + +; unsigned pop32(unsigned t0) { +; unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555); +; unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333); +; unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F); +; unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF); +; unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF); +; return t5; +; } +; +; CHECK: define i32 @pop32 +; CHECK: ctpop +define i32 @pop32(i32 %t0) #0 { +entry: + %t0.addr = alloca i32, align 4 + %t1 = alloca i32, align 4 + %t2 = alloca i32, align 4 + %t3 = alloca i32, align 4 + %t4 = alloca i32, align 4 + %t5 = alloca i32, align 4 + store i32 
%t0, i32* %t0.addr, align 4 + %0 = load i32, i32* %t0.addr, align 4 + %and = and i32 %0, 1431655765 + %1 = load i32, i32* %t0.addr, align 4 + %shr = lshr i32 %1, 1 + %and1 = and i32 %shr, 1431655765 + %add = add i32 %and, %and1 + store i32 %add, i32* %t1, align 4 + %2 = load i32, i32* %t1, align 4 + %and2 = and i32 %2, 858993459 + %3 = load i32, i32* %t1, align 4 + %shr3 = lshr i32 %3, 2 + %and4 = and i32 %shr3, 858993459 + %add5 = add i32 %and2, %and4 + store i32 %add5, i32* %t2, align 4 + %4 = load i32, i32* %t2, align 4 + %and6 = and i32 %4, 252645135 + %5 = load i32, i32* %t2, align 4 + %shr7 = lshr i32 %5, 4 + %and8 = and i32 %shr7, 252645135 + %add9 = add i32 %and6, %and8 + store i32 %add9, i32* %t3, align 4 + %6 = load i32, i32* %t3, align 4 + %and10 = and i32 %6, 16711935 + %7 = load i32, i32* %t3, align 4 + %shr11 = lshr i32 %7, 8 + %and12 = and i32 %shr11, 16711935 + %add13 = add i32 %and10, %and12 + store i32 %add13, i32* %t4, align 4 + %8 = load i32, i32* %t4, align 4 + %and14 = and i32 %8, 65535 + %9 = load i32, i32* %t4, align 4 + %shr15 = lshr i32 %9, 16 + %and16 = and i32 %shr15, 65535 + %add17 = add i32 %and14, %and16 + store i32 %add17, i32* %t5, align 4 + %10 = load i32, i32* %t5, align 4 + ret i32 %10 +} + + +; typedef unsigned long long u64_t; +; u64_t pop64(u64_t t0) { +; u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL); +; u64_t t2 = (t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL); +; u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL); +; u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL); +; u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL); +; u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL); +; return t6; +; } +; +; CHECK: define i64 @pop64 +; CHECK: ctpop +define i64 @pop64(i64 %t0) #0 { +entry: + %t0.addr = alloca i64, align 8 + %t1 = alloca i64, align 8 + %t2 = alloca i64, align 8 + %t3 = alloca i64, align 8 + %t4 = alloca i64, align 8 + %t5 = alloca i64, align 8 + %t6 = alloca i64, align 8 + store i64 %t0, i64* %t0.addr, align 8 + %0 = load i64, i64* %t0.addr, align 8 + %and = and i64 %0, 6148914691236517205 + %1 = load i64, i64* %t0.addr, align 8 + %shr = lshr i64 %1, 1 + %and1 = and i64 %shr, 6148914691236517205 + %add = add i64 %and, %and1 + store i64 %add, i64* %t1, align 8 + %2 = load i64, i64* %t1, align 8 + %and2 = and i64 %2, 3689348814741910323 + %3 = load i64, i64* %t1, align 8 + %shr3 = lshr i64 %3, 2 + %and4 = and i64 %shr3, 3689348814741910323 + %add5 = add i64 %and2, %and4 + store i64 %add5, i64* %t2, align 8 + %4 = load i64, i64* %t2, align 8 + %and6 = and i64 %4, 1085102592571150095 + %5 = load i64, i64* %t2, align 8 + %shr7 = lshr i64 %5, 4 + %and8 = and i64 %shr7, 1085102592571150095 + %add9 = add i64 %and6, %and8 + store i64 %add9, i64* %t3, align 8 + %6 = load i64, i64* %t3, align 8 + %and10 = and i64 %6, 71777214294589695 + %7 = load i64, i64* %t3, align 8 + %shr11 = lshr i64 %7, 8 + %and12 = and i64 %shr11, 71777214294589695 + %add13 = add i64 %and10, %and12 + store i64 %add13, i64* %t4, align 8 + %8 = load i64, i64* %t4, align 8 + %and14 = and i64 %8, 281470681808895 + %9 = load i64, i64* %t4, align 8 + %shr15 = lshr i64 %9, 16 + %and16 = and i64 %shr15, 281470681808895 + %add17 = add i64 %and14, %and16 + store i64 %add17, i64* %t5, align 8 + %10 = load i64, i64* %t5, align 8 + %and18 = and i64 %10, 4294967295 + %11 = load i64, i64* %t5, align 8 + %shr19 = lshr i64 %11, 32 + %and20 = and i64 %shr19, 
4294967295 + %add21 = add i64 %and18, %and20 + store i64 %add21, i64* %t6, align 8 + %12 = load i64, i64* %t6, align 8 + ret i64 %12 +} + +attributes #0 = { nounwind }