Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -71,6 +71,10 @@
 
   bool MadeIRChange = false;
 
+  // Handle bit counting patterns.
+  BitCountCombine BCC(TLI, DL);
+  MadeIRChange |= BCC.run(F);
+
   // Handle TruncInst patterns
   TruncInstCombine TIC(TLI, DL, DT);
   MadeIRChange |= TIC.run(F);
@@ -87,6 +91,10 @@
   auto &DL = F.getParent()->getDataLayout();
   bool MadeIRChange = false;
 
+  // Handle bit counting patterns.
+  BitCountCombine BCC(TLI, DL);
+  MadeIRChange |= BCC.run(F);
+
   // Handle TruncInst patterns
   TruncInstCombine TIC(TLI, DL, DT);
   MadeIRChange |= TIC.run(F);
Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -118,4 +118,28 @@
   /// \param SclTy scalar version of new type to reduce expression dag into.
   void ReduceExpressionDag(Type *SclTy);
 };
+
+//===----------------------------------------------------------------------===//
+// BitCountCombine - looks for code that computes the population count and
+// the count of leading zeros using the typical bit-twiddling algorithms
+// from Hacker's Delight.
+//===----------------------------------------------------------------------===//
+
+class BitCountCombine {
+public:
+  BitCountCombine(const TargetLibraryInfo &TLI, const DataLayout &DL)
+      : TLI(TLI), DL(DL) {}
+
+  bool run(Function &F);
+
+private:
+  Value *matchCtpopW(Instruction &In, unsigned BW);
+  Value *optimizeToCtpop(Instruction &In);
+  Value *optimizeToCtlz(Instruction &In);
+  bool runOnBlock(BasicBlock &B);
+
+  const TargetLibraryInfo &TLI;
+  const DataLayout &DL;
+};
+
 } // end namespace llvm.
Index: lib/Transforms/AggressiveInstCombine/BitCountCombine.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/AggressiveInstCombine/BitCountCombine.cpp
@@ -0,0 +1,296 @@
+//===- BitCountCombine.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This code looks for code that calculates the population count and the
+// leading zeros count using the bit-manipulation methods from Hacker's
+// Delight.
+//
+// ctpop(n):
+//   bw = bitwidth(n)
+//   n = (n & 0b01010101..0101) + ((n & 0b10101010..1010) >> 1)
+//   n = (n & 0b00110011..0011) + ((n & 0b11001100..1100) >> 2)
+//   ...
+//   n = (n & 0b000000..111111) + ((n & 0b111111..000000) >> bw/2)
+//   return n
+//
+// ctlz(n):
+//   bw = bitwidth(n)
+//   n = n | (n >> 1)
+//   n = n | (n >> 2)
+//   n = n | (n >> 4)
+//   ...
+//   n = n | (n >> bw/2)
+//   return bw - ctpop(n)
+//
+//===----------------------------------------------------------------------===//
+
+#include "AggressiveInstCombineInternal.h"
+#include "llvm/Analysis/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+// Check if In matches a ctpop calculation pattern for a value of width BW
+// bits. If so, return the argument V such that ctpop(V) would be a candidate
+// for replacing In.
+// The matched pattern is:
+//   x0 := (V & 0x55..5) + ((V>>1) & 0x55..5)
+//   x1 := (x0 & 0x33..3) + ((x0>>2) & 0x33..3)
+//   ...
+//   xn := (xn-1 & 0x00..0FF..F) + ((xn-1 >> BW/2) & 0x00..0FF..F)
+// where xn is the candidate for ctpop(V).
+Value *BitCountCombine::matchCtpopW(Instruction &In, unsigned BW) {
+  auto matchStep = [] (Value *V, unsigned S, const APInt &M, bool ShiftAlone)
+        -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Add(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+
+    auto matchAndShift = [S,&M,ShiftAlone] (Value *V0, Value *V1) -> Value* {
+      Value *V = nullptr;
+      const APInt *P;
+      auto Mask = m_APInt(P);
+      auto Shift = m_SpecificInt(S);
+
+      if (!match(V0, m_And(m_Value(V), Mask)))
+        return nullptr;
+      if (ShiftAlone) {
+        if (!match(V1, m_LShr(m_Specific(V), Shift)))
+          return nullptr;
+      } else {
+        if (!match(V1, m_And(m_LShr(m_Specific(V), Shift), Mask)))
+          return nullptr;
+      }
+      if (!APInt::isSameValue(M, *P))
+        return nullptr;
+      return V;
+    };
+
+    if (Value *T = matchAndShift(Op0, Op1))
+      return T;
+    if (Value *T = matchAndShift(Op1, Op0))
+      return T;
+    return nullptr;
+  };
+
+  // Generate the bitmask for the & operation. BW is the bit-width of the
+  // entire mask. The masks are:
+  //   0b01010101..01010101   0x55..55   1 bit every 2 bits
+  //   0b00110011..00110011   0x33..33   2 bits every 4 bits
+  //   0b00000111..00000111   0x07..07   3 bits every 8 bits
+  //   ...                    ...        log2(S) bits every S bits
+  // Normally the masks would be 01010101, 00110011, 00001111, i.e. the
+  // number of contiguous 1 bits in each group would be twice the number
+  // in the previous mask, but by the time this code runs, the "demanded"
+  // bits have been optimized to only require one more 1 bit in each
+  // subsequent mask. This function generates the post-optimized masks.
+  auto getMask = [] (unsigned S, unsigned BW) -> APInt {
+    assert(isPowerOf2_32(S));
+    APInt M(S, S-1);
+    return APInt::getSplat(BW, M);
+  };
+
+  Value *V = &In;
+  bool ShiftAlone = true;
+  unsigned N = BW;
+  while (N > 1) {
+    unsigned S = N/2;
+    V = matchStep(V, S, getMask(N, BW), ShiftAlone);
+    if (!V)
+      return nullptr;
+    N = S;
+    ShiftAlone = false;
+  }
+
+  return V;
+}
+
+// If In is an expression that evaluates a population count via the shift/add
+// pattern above, return the equivalent expression using the ctpop intrinsic.
+// Otherwise return nullptr.
+Value *BitCountCombine::optimizeToCtpop(Instruction &In) {
+  IntegerType *Ty = dyn_cast<IntegerType>(In.getType());
+  if (!Ty)
+    return nullptr;
+
+  // Take the first shift amount feeding the add, and assume this is the
+  // last shift in the popcnt computation.
+  Value *Op0 = nullptr, *Op1 = nullptr;
+  if (!match(&In, m_Add(m_Value(Op0), m_Value(Op1))))
+    return nullptr;
+
+  // Shift by half-width.
+  uint64_t SH = 0;
+  if (!match(Op0, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op1, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op0, m_LShr(m_Value(), m_ConstantInt(SH))) &&
+      !match(Op1, m_LShr(m_Value(), m_ConstantInt(SH))))
+    return nullptr;
+
+  if (SH < 4 || !isPowerOf2_64(SH))
+    return nullptr;
+
+  Value *V = matchCtpopW(In, 2*SH);
+  if (!V)
+    return nullptr;
+
+  unsigned TW = Ty->getBitWidth(), BW = 2*SH;
+  if (BW < TW) {
+    // BW is the bit width of the expression whose population count is
+    // being calculated. TW is the bit width of the type associated with
+    // that expression. Usually they are the same, but for ctpop8 the
+    // type may be "unsigned", i.e. 32-bit, while the ctpop8 would only
+    // consider the low 8 bits. In that case BW=8 and TW=32.
+    KnownBits K(TW);
+    computeKnownBits(V, K, DL);
+    APInt Need0 = APInt::getBitsSet(TW, BW, TW);
+    if ((K.Zero & Need0) != Need0)
+      return nullptr;
+  }
+
+  IRBuilder<> Builder(&In);
+  Module *M = In.getParent()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, {V->getType()});
+  CallInst *CI = Builder.CreateCall(Func, {V});
+  CI->setDebugLoc(In.getDebugLoc());
+  return CI;
+}
+
+Value *BitCountCombine::optimizeToCtlz(Instruction &In) {
+  // Let bw = bitwidth(n),
+  // convert
+  //   n = n | (n>>1)
+  //   n = n | (n>>2)
+  //   n = n | (n>>4)
+  //   ...
+  //   n = n | (n>>bw/2)
+  //   bw - ctpop(n)
+  // to
+  //   ctlz(n).
+  // This code expects that the ctpop intrinsic has already been generated.
+
+  // Skip the check for the subtract, since it may have been folded into
+  // another computation. Start the checks at the ctpop intrinsic.
+  if (!match(&In, m_Intrinsic<Intrinsic::ctpop>()))
+    return nullptr;
+
+  // Get the argument of the ctpop.
+  Value *V = In.getOperand(0);
+
+  // The argument to ctpop can be zero-extended in some cases. It is safe
+  // to ignore the zext.
+  if (auto *Z = dyn_cast<ZExtInst>(V))
+    V = Z->getOperand(0);
+
+  // A pre-existing ctpop intrinsic can return a vector.
+  IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
+  if (!Ty)
+    return nullptr;
+
+  // Start the width search at the number of bits not known to be zero.
+  KnownBits K(Ty->getBitWidth());
+  computeKnownBits(V, K, DL);
+  unsigned BW = Ty->getBitWidth() - K.Zero.countLeadingOnes();
+  if (!isPowerOf2_32(BW))
+    BW = NextPowerOf2(BW);
+
+  auto matchOrShift = [] (Value *V, unsigned S) -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Or(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+    if (match(Op0, m_LShr(m_Specific(Op1), m_SpecificInt(S))))
+      return Op1;
+    if (match(Op1, m_LShr(m_Specific(Op0), m_SpecificInt(S))))
+      return Op0;
+    return nullptr;
+  };
+
+  // The bitwidth of the input expression will be detected iteratively
+  // by trying the initial match with increasing widths. The bitwidth
+  // cannot be shorter than the expression whose ctpop was calculated,
+  // since the leading zero calculation would include bits not accounted
+  // for by the algorithm. Limit the bitwidth search to 64 bits.
+  while (!matchOrShift(V, BW/2) && BW <= 64)
+    BW *= 2;
+  if (BW > 64)
+    return nullptr;
+
+  unsigned N = BW;
+  while (N > 1) {
+    N /= 2;
+    V = matchOrShift(V, N);
+    if (!V)
+      return nullptr;
+  }
+
+  // The value of BW is the one that determines the type of ctlz's argument.
+  IRBuilder<> Builder(&In);
+  if (BW > Ty->getBitWidth()) {
+    IntegerType *ATy = IntegerType::get(In.getContext(), BW);
+    V = Builder.CreateZExt(V, ATy);
+  }
+  Module *M = In.getParent()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, {V->getType()});
+  Value *False = ConstantInt::getFalse(In.getContext());
+  Instruction *CI = Builder.CreateCall(Func, {V, False});
+  CI->setDebugLoc(In.getDebugLoc());
+  Value *Ext = In.getType() != CI->getType()
+                   ? Builder.CreateZExt(CI, In.getType())
+                   : CI;
+
+  // The original code computes ctlz(n) as "bw - ctpop(n')", and In is the
+  // ctpop(n') part being replaced here. Since ctpop(n') == bw - ctlz(n),
+  // return "bw - Ext", where Ext is the (possibly extended) ctlz value.
+  return Builder.CreateSub(ConstantInt::get(In.getType(), BW), Ext, "",
+                           true, true);
+}
+
+bool BitCountCombine::runOnBlock(BasicBlock &B) {
+  bool Changed = false;
+
+  // Iterate over the block as long as there are more intrinsics generated.
+  while (true) {
+    Value *Int = nullptr;
+    for (Instruction &In : reverse(B)) {
+      Int = optimizeToCtpop(In);
+      if (!Int)
+        Int = optimizeToCtlz(In);
+      if (Int) {
+        Changed = true;
+        In.replaceAllUsesWith(Int);
+        RecursivelyDeleteTriviallyDeadInstructions(&In, &TLI);
+        break;
+      }
+    }
+    if (!Int)
+      break;
+  }
+
+  return Changed;
+}
+
+bool BitCountCombine::run(Function &F) {
+  // Avoid optimizing compiler-rt functions that do the same thing.
+  // If the intrinsics expand to calls to compiler-rt, this could
+  // cause an infinite loop.
+  StringRef N = F.getName();
+  // Check for names of the form __popcount?i2, e.g. __popcountsi2:
+  if (N.size() == 13 && N.startswith("__popcount") && N.endswith("i2"))
+    return false;
+  // Check for names of the form __clz?i2, e.g. __clzsi2:
+  if (N.size() == 8 && N.startswith("__clz") && N.endswith("i2"))
+    return false;
+
+  bool Changed = false;
+  for (BasicBlock &B : F)
+    Changed |= runOnBlock(B);
+
+  return Changed;
+}
Index: lib/Transforms/AggressiveInstCombine/CMakeLists.txt
===================================================================
--- lib/Transforms/AggressiveInstCombine/CMakeLists.txt
+++ lib/Transforms/AggressiveInstCombine/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_llvm_library(LLVMAggressiveInstCombine
   AggressiveInstCombine.cpp
+  BitCountCombine.cpp
   TruncInstCombine.cpp
 
   ADDITIONAL_HEADER_DIRS
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -319,8 +319,8 @@
   MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
   MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
   // Combine silly seq's
-  if (OptLevel > 2)
-    MPM.add(createAggressiveInstCombinerPass());
+//  if (OptLevel > 2)
+//    MPM.add(createAggressiveInstCombinerPass());
   addInstructionCombiningPass(MPM);
   if (SizeLevel == 0 && !DisableLibCallsShrinkWrap)
     MPM.add(createLibCallsShrinkWrapPass());
@@ -470,6 +470,8 @@
   addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
 
   if (OptLevel > 2)
+    MPM.add(createAggressiveInstCombinerPass());
+  if (OptLevel > 2)
     MPM.add(createCallSiteSplittingPass());
 
   MPM.add(createIPSCCPPass());          // IP SCCP
Index: test/Transforms/AggressiveInstCombine/ctlz-combine.ll
===================================================================
--- /dev/null
+++ test/Transforms/AggressiveInstCombine/ctlz-combine.ll
@@ -0,0 +1,176 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s
+
+; unsigned ctlz16(unsigned short t0) {
+;   t0 = t0 | (t0>>1);
+;   t0 = t0 | (t0>>2);
+;   t0 = t0 | (t0>>4);
+;   t0 = t0 | (t0>>8);
+;   unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 0x5555);
+;   unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333);
+;   unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F);
+;   unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF);
+;   return 16-t4;
+; }
+;
+define i32 @ctlz16(i16 zeroext %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: @ctlz16(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i16 @llvm.ctlz.i16(i16 [[A0:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nuw nsw i32 16, [[TMP1]]
+; CHECK-NEXT:    [[V24:%.*]] = sub nsw i32 16, [[TMP2]]
+; CHECK-NEXT:    ret i32 [[V24]]
+;
+b0:
+  %v0 = lshr i16 %a0, 1
+  %v1 = or i16 %v0, %a0
+  %v2 = lshr i16 %v1, 2
+  %v3 = or i16 %v2, %v1
+  %v4 = lshr i16 %v3, 4
+  %v5 = or i16 %v4, %v3
+  %v6 = lshr i16 %v5, 8
+  %v7 = or i16 %v6, %v5
+  %v8 = zext i16 %v7 to i32
+  %v9 = and i32 %v8, 21845
+  %v10 = lshr i32 %v8, 1
+  %v11 = and i32 %v10, 21845
+  %v12 = add nuw nsw i32 %v9, %v11
+  %v13 = and i32 %v12, 13107
+  %v14 = lshr i32 %v12, 2
+  %v15 = and i32 %v14, 13107
+  %v16 = add nuw nsw i32 %v13, %v15
+  %v17 = and i32 %v16, 1799
+  %v18 = lshr i32 %v16, 4
+  %v19 = and i32 %v18, 1799
+  %v20 = add nuw nsw i32 %v17, %v19
+  %v21 = and i32 %v20, 15
+  %v22 = lshr i32 %v20, 8
+  %v23 = add nuw nsw i32 %v21, %v22
+  %v24 = sub nsw i32 16, %v23
+  ret i32 %v24
+}
+
+; unsigned ctlz32(unsigned t0) {
+;   t0 = t0 | (t0>>1);
+;   t0 = t0 | (t0>>2);
+;   t0 = t0 | (t0>>4);
+;   t0 = t0 | (t0>>8);
+;   t0 = t0 | (t0>>16);
+;   unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555);
+;   unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333);
+;   unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F);
+;   unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF);
+;   unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF);
+;   return 32-t5;
+; }
+;
+define i32 @ctlz32(i32 %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @ctlz32(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[A0:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw nsw i32 32, [[TMP0]]
+; CHECK-NEXT:    [[V29:%.*]] = sub nsw i32 32, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[V29]]
+;
+b0:
+  %v0 = lshr i32 %a0, 1
+  %v1 = or i32 %v0, %a0
+  %v2 = lshr i32 %v1, 2
+  %v3 = or i32 %v1, %v2
+  %v4 = lshr i32 %v3, 4
+  %v5 = or i32 %v3, %v4
+  %v6 = lshr i32 %v5, 8
+  %v7 = or i32 %v5, %v6
+  %v8 = lshr i32 %v7, 16
+  %v9 = or i32 %v7, %v8
+  %v10 = and i32 %v9, 1431655765
+  %v11 = lshr i32 %v9, 1
+  %v12 = and i32 %v11, 1431655765
+  %v13 = add nuw i32 %v10, %v12
+  %v14 = and i32 %v13, 858993459
+  %v15 = lshr i32 %v13, 2
+  %v16 = and i32 %v15, 858993459
+  %v17 = add nuw nsw i32 %v14, %v16
+  %v18 = and i32 %v17, 117901063
+  %v19 = lshr i32 %v17, 4
+  %v20 = and i32 %v19, 117901063
+  %v21 = add nuw nsw i32 %v18, %v20
+  %v22 = and i32 %v21, 983055
+  %v23 = lshr i32 %v21, 8
+  %v24 = and i32 %v23, 983055
+  %v25 = add nuw nsw i32 %v22, %v24
+  %v26 = and i32 %v25, 31
+  %v27 = lshr i32 %v25, 16
+  %v28 = add nuw nsw i32 %v26, %v27
+  %v29 = sub nsw i32 32, %v28
+  ret i32 %v29
+}
+
+; typedef unsigned long long u64_t;
+; u64_t ctlz64(u64_t t0) {
+;   t0 = t0 | (t0>>1);
+;   t0 = t0 | (t0>>2);
+;   t0 = t0 | (t0>>4);
+;   t0 = t0 | (t0>>8);
+;   t0 = t0 | (t0>>16);
+;   t0 = t0 | (t0>>32);
+;   u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL);
+;   u64_t t2 = (t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL);
+;   u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL);
+;   u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL);
+;   u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL);
+;   u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL);
+;   return 64-t6;
+; }
+;
+define i64 @ctlz64(i64 %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @ctlz64(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A0:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw nsw i64 64, [[TMP0]]
+; CHECK-NEXT:    [[V35:%.*]] = sub nsw i64 64, [[TMP1]]
+; CHECK-NEXT:    ret i64 [[V35]]
+;
+b0:
+  %v0 = lshr i64 %a0, 1
+  %v1 = or i64 %v0, %a0
+  %v2 = lshr i64 %v1, 2
+  %v3 = or i64 %v1, %v2
+  %v4 = lshr i64 %v3, 4
+  %v5 = or i64 %v3, %v4
+  %v6 = lshr i64 %v5, 8
+  %v7 = or i64 %v5, %v6
+  %v8 = lshr i64 %v7, 16
+  %v9 = or i64 %v7, %v8
+  %v10 = lshr i64 %v9, 32
+  %v11 = or i64 %v9, %v10
+  %v12 = and i64 %v11, 6148914691236517205
+  %v13 = lshr i64 %v11, 1
+  %v14 = and i64 %v13, 6148914691236517205
+  %v15 = add nuw i64 %v12, %v14
+  %v16 = and i64 %v15, 3689348814741910323
+  %v17 = lshr i64 %v15, 2
+  %v18 = and i64 %v17, 3689348814741910323
+  %v19 = add nuw nsw i64 %v16, %v18
+  %v20 = and i64 %v19, 506381209866536711
+  %v21 = lshr i64 %v19, 4
+  %v22 = and i64 %v21, 506381209866536711
+  %v23 = add nuw nsw i64 %v20, %v22
+  %v24 = and i64 %v23, 4222189076152335
+  %v25 = lshr i64 %v23, 8
+  %v26 = and i64 %v25, 4222189076152335
+  %v27 = add nuw nsw i64 %v24, %v26
+  %v28 = and i64 %v27, 133143986207
+  %v29 = lshr i64 %v27, 16
+  %v30 = and i64 %v29, 133143986207
+  %v31 = add nuw nsw i64 %v28, %v30
+  %v32 = and i64 %v31, 63
+  %v33 = lshr i64 %v31, 32
+  %v34 = add nuw nsw i64 %v32, %v33
+  %v35 = sub nsw i64 64, %v34
+  ret i64 %v35
+}
+
+attributes #0 = { norecurse nounwind readnone uwtable }
+attributes #1 = { nounwind uwtable }
Index: test/Transforms/AggressiveInstCombine/ctlz-rt.ll
===================================================================
--- /dev/null
+++ test/Transforms/AggressiveInstCombine/ctlz-rt.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s
+
+; Make sure that the intrinsic is not generated for compiler-rt functions.
+
+define i32 @__clzsi2(i32 %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: @__clzsi2(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = lshr i32 [[A0:%.*]], 1
+; CHECK-NEXT:    [[V1:%.*]] = or i32 [[V0]], [[A0]]
+; CHECK-NEXT:    [[V2:%.*]] = lshr i32 [[V1]], 2
+; CHECK-NEXT:    [[V3:%.*]] = or i32 [[V1]], [[V2]]
+; CHECK-NEXT:    [[V4:%.*]] = lshr i32 [[V3]], 4
+; CHECK-NEXT:    [[V5:%.*]] = or i32 [[V3]], [[V4]]
+; CHECK-NEXT:    [[V6:%.*]] = lshr i32 [[V5]], 8
+; CHECK-NEXT:    [[V7:%.*]] = or i32 [[V5]], [[V6]]
+; CHECK-NEXT:    [[V8:%.*]] = lshr i32 [[V7]], 16
+; CHECK-NEXT:    [[V9:%.*]] = or i32 [[V7]], [[V8]]
+; CHECK-NEXT:    [[V10:%.*]] = and i32 [[V9]], 1431655765
+; CHECK-NEXT:    [[V11:%.*]] = lshr i32 [[V9]], 1
+; CHECK-NEXT:    [[V12:%.*]] = and i32 [[V11]], 1431655765
+; CHECK-NEXT:    [[V13:%.*]] = add nuw i32 [[V10]], [[V12]]
+; CHECK-NEXT:    [[V14:%.*]] = and i32 [[V13]], 858993459
+; CHECK-NEXT:    [[V15:%.*]] = lshr i32 [[V13]], 2
+; CHECK-NEXT:    [[V16:%.*]] = and i32 [[V15]], 858993459
+; CHECK-NEXT:    [[V17:%.*]] = add nuw nsw i32 [[V14]], [[V16]]
+; CHECK-NEXT:    [[V18:%.*]] = and i32 [[V17]], 117901063
+; CHECK-NEXT:    [[V19:%.*]] = lshr i32 [[V17]], 4
+; CHECK-NEXT:    [[V20:%.*]] = and i32 [[V19]], 117901063
+; CHECK-NEXT:    [[V21:%.*]] = add nuw nsw i32 [[V18]], [[V20]]
+; CHECK-NEXT:    [[V22:%.*]] = and i32 [[V21]], 983055
+; CHECK-NEXT:    [[V23:%.*]] = lshr i32 [[V21]], 8
+; CHECK-NEXT:    [[V24:%.*]] = and i32 [[V23]], 983055
+; CHECK-NEXT:    [[V25:%.*]] = add nuw nsw i32 [[V22]], [[V24]]
+; CHECK-NEXT:    [[V26:%.*]] = and i32 [[V25]], 31
+; CHECK-NEXT:    [[V27:%.*]] = lshr i32 [[V25]], 16
+; CHECK-NEXT:    [[V28:%.*]] = add nuw nsw i32 [[V26]], [[V27]]
+; CHECK-NEXT:    [[V29:%.*]] = sub nsw i32 32, [[V28]]
+; CHECK-NEXT:    ret i32 [[V29]]
+;
+b0:
+  %v0 = lshr i32 %a0, 1
+  %v1 = or i32 %v0, %a0
+  %v2 = lshr i32 %v1, 2
+  %v3 = or i32 %v1, %v2
+  %v4 = lshr i32 %v3, 4
+  %v5 = or i32 %v3, %v4
+  %v6 = lshr i32 %v5, 8
+  %v7 = or i32 %v5, %v6
+  %v8 = lshr i32 %v7, 16
+  %v9 = or i32 %v7, %v8
+  %v10 = and i32 %v9, 1431655765
+  %v11 = lshr i32 %v9, 1
+  %v12 = and i32 %v11, 1431655765
+  %v13 = add nuw i32 %v10, %v12
+  %v14 = and i32 %v13, 858993459
+  %v15 = lshr i32 %v13, 2
+  %v16 = and i32 %v15, 858993459
+  %v17 = add nuw nsw i32 %v14, %v16
+  %v18 = and i32 %v17, 117901063
+  %v19 = lshr i32 %v17, 4
+  %v20 = and i32 %v19, 117901063
+  %v21 = add nuw nsw i32 %v18, %v20
+  %v22 = and i32 %v21, 983055
+  %v23 = lshr i32 %v21, 8
+  %v24 = and i32 %v23, 983055
+  %v25 = add nuw nsw i32 %v22, %v24
+  %v26 = and i32 %v25, 31
+  %v27 = lshr i32 %v25, 16
+  %v28 = add nuw nsw i32 %v26, %v27
+  %v29 = sub nsw i32 32, %v28
+  ret i32 %v29
+}
+
+attributes #0 = { norecurse nounwind readnone }
Index: test/Transforms/AggressiveInstCombine/ctpop-combine.ll
===================================================================
--- /dev/null
+++ test/Transforms/AggressiveInstCombine/ctpop-combine.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s
+
+; unsigned pop8(unsigned char t0) {
+;   unsigned t1 = (t0 & 0x55) + ((t0>>1) & 0x55);
+;   unsigned t2 = (t1 & 0x33) + ((t1>>2) & 0x33);
+;   unsigned t3 = (t2 & 0x0F) + ((t2>>4) & 0x0F);
+;   return t3;
+; }
+;
+define i32 @pop8(i8 zeroext %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: @pop8(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = zext i8 [[A0:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[V0]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+b0:
+  %v0 = zext i8 %a0 to i32
+  %v1 = and i32 %v0, 85
+  %v2 = lshr i32 %v0, 1
+  %v3 = and i32 %v2, 85
+  %v4 = add nuw nsw i32 %v1, %v3
+  %v5 = and i32 %v4, 51
+  %v6 = lshr i32 %v4, 2
+  %v7 = and i32 %v6, 51
+  %v8 = add nuw nsw i32 %v5, %v7
+  %v9 = and i32 %v8, 7
+  %v10 = lshr i32 %v8, 4
+  %v11 = add nuw nsw i32 %v9, %v10
+  ret i32 %v11
+}
+
+; unsigned pop16(unsigned short t0) {
+;   unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 0x5555);
+;   unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333);
+;   unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F);
+;   unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF);
+;   return t4;
+; }
+;
+define i32 @pop16(i16 zeroext %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @pop16(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = zext i16 [[A0:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[V0]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+b0:
+  %v0 = zext i16 %a0 to i32
+  %v1 = and i32 %v0, 21845
+  %v2 = lshr i32 %v0, 1
+  %v3 = and i32 %v2, 21845
+  %v4 = add nuw nsw i32 %v1, %v3
+  %v5 = and i32 %v4, 13107
+  %v6 = lshr i32 %v4, 2
+  %v7 = and i32 %v6, 13107
+  %v8 = add nuw nsw i32 %v5, %v7
+  %v9 = and i32 %v8, 1799
+  %v10 = lshr i32 %v8, 4
+  %v11 = and i32 %v10, 1799
+  %v12 = add nuw nsw i32 %v9, %v11
+  %v13 = and i32 %v12, 15
+  %v14 = lshr i32 %v12, 8
+  %v15 = add nuw nsw i32 %v13, %v14
+  ret i32 %v15
+}
+
+; unsigned pop32(unsigned t0) {
+;   unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555);
+;   unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333);
+;   unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F);
+;   unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF);
+;   unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF);
+;   return t5;
+; }
+;
+define i32 @pop32(i32 %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @pop32(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[A0:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+b0:
+  %v0 = and i32 %a0, 1431655765
+  %v1 = lshr i32 %a0, 1
+  %v2 = and i32 %v1, 1431655765
+  %v3 = add nuw i32 %v0, %v2
+  %v4 = and i32 %v3, 858993459
+  %v5 = lshr i32 %v3, 2
+  %v6 = and i32 %v5, 858993459
+  %v7 = add nuw nsw i32 %v4, %v6
+  %v8 = and i32 %v7, 117901063
+  %v9 = lshr i32 %v7, 4
+  %v10 = and i32 %v9, 117901063
+  %v11 = add nuw nsw i32 %v8, %v10
+  %v12 = and i32 %v11, 983055
+  %v13 = lshr i32 %v11, 8
+  %v14 = and i32 %v13, 983055
+  %v15 = add nuw nsw i32 %v12, %v14
+  %v16 = and i32 %v15, 31
+  %v17 = lshr i32 %v15, 16
+  %v18 = add nuw nsw i32 %v16, %v17
+  ret i32 %v18
+}
+
+; typedef unsigned long long u64_t;
+; u64_t pop64(u64_t t0) {
+;   u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL);
+;   u64_t t2 = (t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL);
+;   u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL);
+;   u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL);
+;   u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL);
+;   u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL);
+;   return t6;
+; }
+;
+define i64 @pop64(i64 %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @pop64(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[A0:%.*]])
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+b0:
+  %v0 = and i64 %a0, 6148914691236517205
+  %v1 = lshr i64 %a0, 1
+  %v2 = and i64 %v1, 6148914691236517205
+  %v3 = add nuw i64 %v0, %v2
+  %v4 = and i64 %v3, 3689348814741910323
+  %v5 = lshr i64 %v3, 2
+  %v6 = and i64 %v5, 3689348814741910323
+  %v7 = add nuw nsw i64 %v4, %v6
+  %v8 = and i64 %v7, 506381209866536711
+  %v9 = lshr i64 %v7, 4
+  %v10 = and i64 %v9, 506381209866536711
+  %v11 = add nuw nsw i64 %v8, %v10
+  %v12 = and i64 %v11, 4222189076152335
+  %v13 = lshr i64 %v11, 8
+  %v14 = and i64 %v13, 4222189076152335
+  %v15 = add nuw nsw i64 %v12, %v14
+  %v16 = and i64 %v15, 133143986207
+  %v17 = lshr i64 %v15, 16
+  %v18 = and i64 %v17, 133143986207
+  %v19 = add nuw nsw i64 %v16, %v18
+  %v20 = and i64 %v19, 63
+  %v21 = lshr i64 %v19, 32
+  %v22 = add nuw nsw i64 %v20, %v21
+  ret i64 %v22
+}
+
+; Negative test: one of the mask values is incorrect for ctpop.
+define i32 @not_pop32(i32 %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @not_pop32(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = and i32 [[A0:%.*]], 1431655765
+; CHECK-NEXT:    [[V1:%.*]] = lshr i32 [[A0]], 1
+; CHECK-NEXT:    [[V2:%.*]] = and i32 [[V1]], 1431655765
+; CHECK-NEXT:    [[V3:%.*]] = add nuw i32 [[V0]], [[V2]]
+; CHECK-NEXT:    [[V4:%.*]] = and i32 [[V3]], 858993459
+; CHECK-NEXT:    [[V5:%.*]] = lshr i32 [[V3]], 2
+; CHECK-NEXT:    [[V6:%.*]] = and i32 [[V5]], 858993459
+; CHECK-NEXT:    [[V7:%.*]] = add nuw nsw i32 [[V4]], [[V6]]
+; CHECK-NEXT:    [[V8:%.*]] = and i32 [[V7]], 117902063
+; CHECK-NEXT:    [[V9:%.*]] = lshr i32 [[V7]], 4
+; CHECK-NEXT:    [[V10:%.*]] = and i32 [[V9]], 117902063
+; CHECK-NEXT:    [[V11:%.*]] = add nuw nsw i32 [[V8]], [[V10]]
+; CHECK-NEXT:    [[V12:%.*]] = and i32 [[V11]], 983055
+; CHECK-NEXT:    [[V13:%.*]] = lshr i32 [[V11]], 8
+; CHECK-NEXT:    [[V14:%.*]] = and i32 [[V13]], 983055
+; CHECK-NEXT:    [[V15:%.*]] = add nuw nsw i32 [[V12]], [[V14]]
+; CHECK-NEXT:    [[V16:%.*]] = and i32 [[V15]], 31
+; CHECK-NEXT:    [[V17:%.*]] = lshr i32 [[V15]], 16
+; CHECK-NEXT:    [[V18:%.*]] = add nuw nsw i32 [[V16]], [[V17]]
+; CHECK-NEXT:    ret i32 [[V18]]
+;
+b0:
+  %v0 = and i32 %a0, 1431655765
+  %v1 = lshr i32 %a0, 1
+  %v2 = and i32 %v1, 1431655765
+  %v3 = add nuw i32 %v0, %v2
+  %v4 = and i32 %v3, 858993459
+  %v5 = lshr i32 %v3, 2
+  %v6 = and i32 %v5, 858993459
+  %v7 = add nuw nsw i32 %v4, %v6
+  %v8 = and i32 %v7, 117902063
+  %v9 = lshr i32 %v7, 4
+  %v10 = and i32 %v9, 117902063
+  %v11 = add nuw nsw i32 %v8, %v10
+  %v12 = and i32 %v11, 983055
+  %v13 = lshr i32 %v11, 8
+  %v14 = and i32 %v13, 983055
+  %v15 = add nuw nsw i32 %v12, %v14
+  %v16 = and i32 %v15, 31
+  %v17 = lshr i32 %v15, 16
+  %v18 = add nuw nsw i32 %v16, %v17
+  ret i32 %v18
+}
+
+
+attributes #0 = { norecurse nounwind readnone }
+attributes #1 = { nounwind uwtable }
Index: test/Transforms/AggressiveInstCombine/ctpop-rt.ll
===================================================================
--- /dev/null
+++ test/Transforms/AggressiveInstCombine/ctpop-rt.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s
+
+; Make sure that the intrinsic is not generated for compiler-rt functions.
+
+define i32 @__popcountsi2(i32 %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: @__popcountsi2(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = and i32 [[A0:%.*]], 1431655765
+; CHECK-NEXT:    [[V1:%.*]] = lshr i32 [[A0]], 1
+; CHECK-NEXT:    [[V2:%.*]] = and i32 [[V1]], 1431655765
+; CHECK-NEXT:    [[V3:%.*]] = add nuw i32 [[V0]], [[V2]]
+; CHECK-NEXT:    [[V4:%.*]] = and i32 [[V3]], 858993459
+; CHECK-NEXT:    [[V5:%.*]] = lshr i32 [[V3]], 2
+; CHECK-NEXT:    [[V6:%.*]] = and i32 [[V5]], 858993459
+; CHECK-NEXT:    [[V7:%.*]] = add nuw nsw i32 [[V4]], [[V6]]
+; CHECK-NEXT:    [[V8:%.*]] = and i32 [[V7]], 117901063
+; CHECK-NEXT:    [[V9:%.*]] = lshr i32 [[V7]], 4
+; CHECK-NEXT:    [[V10:%.*]] = and i32 [[V9]], 117901063
+; CHECK-NEXT:    [[V11:%.*]] = add nuw nsw i32 [[V8]], [[V10]]
+; CHECK-NEXT:    [[V12:%.*]] = and i32 [[V11]], 983055
+; CHECK-NEXT:    [[V13:%.*]] = lshr i32 [[V11]], 8
+; CHECK-NEXT:    [[V14:%.*]] = and i32 [[V13]], 983055
+; CHECK-NEXT:    [[V15:%.*]] = add nuw nsw i32 [[V12]], [[V14]]
+; CHECK-NEXT:    [[V16:%.*]] = and i32 [[V15]], 31
+; CHECK-NEXT:    [[V17:%.*]] = lshr i32 [[V15]], 16
+; CHECK-NEXT:    [[V18:%.*]] = add nuw nsw i32 [[V16]], [[V17]]
+; CHECK-NEXT:    ret i32 [[V18]]
+;
+b0:
+  %v0 = and i32 %a0, 1431655765
+  %v1 = lshr i32 %a0, 1
+  %v2 = and i32 %v1, 1431655765
+  %v3 = add nuw i32 %v0, %v2
+  %v4 = and i32 %v3, 858993459
+  %v5 = lshr i32 %v3, 2
+  %v6 = and i32 %v5, 858993459
+  %v7 = add nuw nsw i32 %v4, %v6
+  %v8 = and i32 %v7, 117901063
+  %v9 = lshr i32 %v7, 4
+  %v10 = and i32 %v9, 117901063
+  %v11 = add nuw nsw i32 %v8, %v10
+  %v12 = and i32 %v11, 983055
+  %v13 = lshr i32 %v11, 8
+  %v14 = and i32 %v13, 983055
+  %v15 = add nuw nsw i32 %v12, %v14
+  %v16 = and i32 %v15, 31
+  %v17 = lshr i32 %v15, 16
+  %v18 = add nuw nsw i32 %v16, %v17
+  ret i32 %v18
+}
+
+attributes #0 = { norecurse nounwind readnone }
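
For reference, a minimal C sketch (illustrative only, not part of the patch) of the source-level idiom the new pass targets; it simply restates the pop8 pattern from the ctpop-combine.ll comments above. Given IR in the already-canonicalized form the tests use (masks narrowed by instcombine), running "opt -aggressive-instcombine -S" as in the RUN lines is expected to collapse the whole add/shift chain into a single @llvm.ctpop call.

unsigned pop8(unsigned char t0) {
  /* Hacker's Delight popcount: each step adds adjacent groups of bits. */
  unsigned t1 = (t0 & 0x55) + ((t0 >> 1) & 0x55); /* 1-bit groups -> 2-bit sums  */
  unsigned t2 = (t1 & 0x33) + ((t1 >> 2) & 0x33); /* 2-bit groups -> 4-bit sums  */
  unsigned t3 = (t2 & 0x0F) + ((t2 >> 4) & 0x0F); /* 4-bit groups -> final count */
  return t3;
}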