Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -594,6 +594,10 @@
   /// considered as "Slow".
   enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };
 
+  /// Return true if the target would benefit from recognizing a table-based
+  /// implementation of cttz.
+  bool preferCTTZLowering() const;
+
   /// Return true if the specified immediate is legal add immediate, that
   /// is the target has add instructions which can add a register with the
   /// immediate without having to materialize the immediate into a register.
@@ -1530,6 +1534,7 @@
       APInt &UndefElts2, APInt &UndefElts3,
       std::function<void(Instruction *, unsigned, APInt, APInt &)>
           SimplifyAndSetOp) = 0;
+  virtual bool preferCTTZLowering() = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1908,6 +1913,9 @@
         IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
         SimplifyAndSetOp);
   }
+  bool preferCTTZLowering() override {
+    return Impl.preferCTTZLowering();
+  }
   bool isLegalAddImmediate(int64_t Imm) override {
     return Impl.isLegalAddImmediate(Imm);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -200,6 +200,8 @@
   void getPeelingPreferences(Loop *, ScalarEvolution &,
                              TTI::PeelingPreferences &) const {}
 
+  bool preferCTTZLowering() const { return false; }
+
   bool isLegalAddImmediate(int64_t Imm) const { return false; }
 
   bool isLegalICmpImmediate(int64_t Imm) const { return false; }
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -293,6 +293,10 @@
     return nullptr;
   }
 
+  bool preferCTTZLowering() {
+    return getTLI()->preferCTTZLowering();
+  }
+
  bool isLegalAddImmediate(int64_t imm) {
     return getTLI()->isLegalAddImmediate(imm);
   }
Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2438,6 +2438,12 @@
     return true;
   }
 
+  /// Return true if the target would benefit from recognizing a table-based
+  /// implementation of cttz.
+  virtual bool preferCTTZLowering() const {
+    return false;
+  }
+
   /// Return true if the specified immediate is legal add immediate, that is the
   /// target has add instructions which can add a register with the immediate
   /// without having to materialize the immediate into a register.
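(Note: the four header changes above thread a single query through the
standard TTI layering, so a transform only ever calls
TargetTransformInfo::preferCTTZLowering() and the answer ultimately comes
from the target's TargetLowering. A backend therefore opts in by
overriding one hook. A minimal sketch, using a hypothetical
MyTargetLowering subclass that is not part of this patch:

    // Hypothetical backend opting in to table-based cttz recognition.
    // The TTI layers above forward the query down to this override.
    bool MyTargetLowering::preferCTTZLowering() const { return true; }

The AArch64 override doing exactly this appears below.)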
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -333,6 +333,10 @@
   return TTIImpl->getPeelingPreferences(L, SE, PP);
 }
 
+bool TargetTransformInfo::preferCTTZLowering() const {
+  return TTIImpl->preferCTTZLowering();
+}
+
 bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const {
   return TTIImpl->isLegalAddImmediate(Imm);
 }
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -607,6 +607,7 @@
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
+  bool preferCTTZLowering() const override;
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12830,6 +12830,10 @@
   return LLT();
 }
 
+bool AArch64TargetLowering::preferCTTZLowering() const {
+  return true;
+}
+
 // 12-bit optionally shifted immediates are legal for adds.
 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
   if (Immed == std::numeric_limits<int64_t>::min()) {
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2042,9 +2042,9 @@
 def REV16Xr : OneXRegData<0b001, "rev16", null_frag>;
 
 def : Pat<(cttz GPR32:$Rn),
-          (CLZWr (RBITWr GPR32:$Rn))>;
+          (ANDWri (CLZWr (RBITWr GPR32:$Rn)), (i32 4))>;
 def : Pat<(cttz GPR64:$Rn),
-          (CLZXr (RBITXr GPR64:$Rn))>;
+          (ANDXri (CLZXr (RBITXr GPR64:$Rn)), (i64 5))>;
 def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
                     (i32 1))),
           (CLSWr GPR32:$Rn)>;
Index: llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
@@ -363,10 +364,157 @@
   return false;
 }
 
+static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
+                        uint64_t Shift, uint64_t InputBits) {
+  unsigned Length = Table.getNumElements();
+  uint64_t IntWidth = InputBits / CHAR_BIT;
+  if (Length < InputBits || Length > InputBits * 2)
+    return false;
+
+  APInt Mask(InputBits, ((IntWidth << (InputBits - Shift)) - 1) << Shift);
+  unsigned Matched = 0;
+
+  for (unsigned i = 0; i < Length; i++) {
+    uint64_t Element = Table.getElementAsInteger(i);
+    if (Element < InputBits &&
+        (((Mul << Element) & Mask.getZExtValue()) >> Shift) == i)
+      Matched++;
+  }
+
+  return Matched == InputBits;
+}
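+
+// Worked example with the CASE 1 constants below (Mul = 125613361 =
+// 0x077CB531, Shift = 27, InputBits = 32): a valid table has Table[1] == 1,
+// and the check recomputes (0x077CB531 << 1) & 0xF8000000 = 0x08000000,
+// which shifted right by 27 gives 1, the entry's own index. The table is
+// accepted only when exactly InputBits entries map back to their index
+// this way, one per possible position of the lowest set bit.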
+
+// Try to recognize a table-based ctz implementation.
+// E.g., an example in C (for more cases please see the llvm/tests):
+// int f(unsigned x) {
+//    static const char table[32] =
+//      {0, 1, 28, 2, 29, 14, 24, 3, 30,
+//       22, 20, 15, 25, 17, 4, 8, 31, 27,
+//       13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
+//    return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
+// }
+// This can be lowered to a `cttz` instruction.
+// There is also a special case for the table element at index 0, which
+// determines whether the result is defined for a zero input.
+//
+// Here are some examples of IR for the AArch64 target:
+//
+// CASE 1:
+// %sub = sub i32 0, %x
+// %and = and i32 %sub, %x
+// %mul = mul i32 %and, 125613361
+// %shr = lshr i32 %mul, 27
+// %idxprom = zext i32 %shr to i64
+// %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @ctz1.table,
+//             i64 0, i64 %idxprom
+// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
+//
+// CASE 2:
+// %sub = sub i32 0, %x
+// %and = and i32 %sub, %x
+// %mul = mul i32 %and, 72416175
+// %shr = lshr i32 %mul, 26
+// %idxprom = zext i32 %shr to i64
+// %arrayidx = getelementptr inbounds [64 x i16], [64 x i16]* @ctz2.table,
+//             i64 0, i64 %idxprom
+// %0 = load i16, i16* %arrayidx, align 2, !tbaa !8
+//
+// CASE 3:
+// %sub = sub i32 0, %x
+// %and = and i32 %sub, %x
+// %mul = mul i32 %and, 81224991
+// %shr = lshr i32 %mul, 27
+// %idxprom = zext i32 %shr to i64
+// %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @ctz3.table,
+//             i64 0, i64 %idxprom
+// %0 = load i32, i32* %arrayidx, align 4, !tbaa !8
+//
+// CASE 4:
+// %sub = sub i64 0, %x
+// %and = and i64 %sub, %x
+// %mul = mul i64 %and, 283881067100198605
+// %shr = lshr i64 %mul, 58
+// %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @table, i64 0,
+//             i64 %shr
+// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
+//
+// All of these can be lowered to the @llvm.cttz.i32/i64 intrinsic.
+static bool tryToRecognizeTableBasedCttz(Instruction &I) {
+  LoadInst *LI = dyn_cast<LoadInst>(&I);
+  if (!LI)
+    return false;
+
+  Type *ElType = LI->getPointerOperandType()->getPointerElementType();
+  if (!ElType->isIntegerTy())
+    return false;
+
+  GetElementPtrInst *GEP =
+      dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
+  if (!GEP || !GEP->isInBounds() || GEP->getNumIndices() != 2)
+    return false;
+
+  Type *GEPPointeeType = GEP->getPointerOperandType()->getPointerElementType();
+  if (!GEPPointeeType->isArrayTy())
+    return false;
+
+  uint64_t ArraySize = GEPPointeeType->getArrayNumElements();
+  if (ArraySize != 32 && ArraySize != 64)
+    return false;
+
+  User *GEPUser = dyn_cast<User>(GEP->getPointerOperand());
+  if (!GEPUser)
+    return false;
+
+  ConstantDataArray *ConstData =
+      dyn_cast<ConstantDataArray>(GEPUser->getOperand(0));
+  if (!ConstData)
+    return false;
+
+  Value *Idx1 = GEP->idx_begin()->get();
+  Constant *Zero = dyn_cast<Constant>(Idx1);
+  if (!Zero || !Zero->isZeroValue())
+    return false;
+
+  Value *Idx2 = std::next(GEP->idx_begin())->get();
+
+  bool ConstIsWide = !match(Idx2, m_ZExt(m_Value()));
+
+  Value *X1;
+  uint64_t MulConst, ShiftConst;
+  // FIXME: AArch64 has i64 type for the GEP index, so this match will
+  // probably fail for other targets.
+  if (!match(Idx2,
+             m_ZExtOrSelf(m_LShr(
+                 m_ZExtOrSelf(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
                                    m_ConstantInt(MulConst))),
+                 m_ConstantInt(ShiftConst)))))
+    return false;
+
+  unsigned InputBits = ConstIsWide ? 64 : 32;
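+
+  // The index into an N-entry table is the top log2(N) bits of the
+  // multiply, and isCTTZTable accepts table lengths between InputBits and
+  // 2 * InputBits, so only a shift that extracts the top 5 to 7 bits can
+  // belong to a genuine cttz table lookup.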
+  // Shift should extract top 5..7 bits.
+  if (ShiftConst < InputBits - 7 || ShiftConst > InputBits - 5)
+    return false;
+
+  Type *XType = X1->getType();
+  if (!XType->isIntegerTy(InputBits))
+    return false;
+
+  if (!isCTTZTable(*ConstData, MulConst, ShiftConst, InputBits))
+    return false;
+
+  auto ZeroTableElem = ConstData->getElementAsInteger(0);
+  bool DefinedForZero = ZeroTableElem == InputBits;
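+
+  // When x is 0, x & -x is 0 and the load reads Table[0]. A table that
+  // stores InputBits there defines the result for a zero input, so the
+  // intrinsic below is emitted with its is_zero_undef flag cleared;
+  // otherwise the flag is set and the zero case stays undefined.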
+
+  IRBuilder<> B(LI);
+  ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
+  auto Cttz = B.CreateIntrinsic(Intrinsic::cttz, {XType}, {X1, BoolConst});
+  Value *ZExtOrTrunc = B.CreateZExtOrTrunc(Cttz, ElType);
+  LI->replaceAllUsesWith(ZExtOrTrunc);
+
+  return true;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
-static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
+static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
+                                TargetTransformInfo &TTI) {
   bool MadeChange = false;
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
@@ -382,6 +530,8 @@
       MadeChange |= foldAnyOrAllBitsSet(I);
       MadeChange |= foldGuardedFunnelShift(I, DT);
       MadeChange |= tryToRecognizePopCount(I);
+      if (TTI.preferCTTZLowering())
+        MadeChange |= tryToRecognizeTableBasedCttz(I);
     }
   }
 
@@ -396,12 +546,12 @@
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 static bool runImpl(Function &F, AssumptionCache &AC, TargetLibraryInfo &TLI,
-                    DominatorTree &DT) {
+                    DominatorTree &DT, TargetTransformInfo &TTI) {
   bool MadeChange = false;
   const DataLayout &DL = F.getParent()->getDataLayout();
   TruncInstCombine TIC(AC, TLI, DL, DT);
   MadeChange |= TIC.run(F);
-  MadeChange |= foldUnusualPatterns(F, DT);
+  MadeChange |= foldUnusualPatterns(F, DT, TTI);
   return MadeChange;
 }
 
@@ -410,6 +560,7 @@
     AU.setPreservesCFG();
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
     AU.addPreserved<BasicAAWrapperPass>();
@@ -421,7 +572,8 @@
     auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
     auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
     auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    return runImpl(F, AC, TLI, DT);
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    return runImpl(F, AC, TLI, DT, TTI);
   }
 
 PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
@@ -429,7 +581,8 @@
   auto &AC = AM.getResult<AssumptionAnalysis>(F);
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
-  if (!runImpl(F, AC, TLI, DT)) {
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  if (!runImpl(F, AC, TLI, DT, TTI)) {
     // No changes, all analyses are preserved.
     return PreservedAnalyses::all();
   }
@@ -445,6 +598,7 @@
                       "Combine pattern based expressions", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(AggressiveInstCombinerLegacyPass, "aggressive-instcombine",
                     "Combine pattern based expressions", false, false)
Index: llvm/test/Transforms/AggressiveInstCombine/AArch64/lit.local.cfg
===================================================================
--- /dev/null
+++ llvm/test/Transforms/AggressiveInstCombine/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
Index: llvm/test/Transforms/AggressiveInstCombine/AArch64/lower-table-based-ctz.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/AggressiveInstCombine/AArch64/lower-table-based-ctz.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -aggressive-instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
+
+@f.table = internal unnamed_addr constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
+
+define i32 @f(i32 %x) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @f.table, i64 0, i64 %idxprom
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
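(Note: as a standalone sanity check of the de Bruijn property this fold
relies on, the table from the test above can be verified over the 32
single-bit inputs, which are the only nonzero values x & -x can produce.
The program below is illustrative only and not part of the patch; names
such as tableCttz are made up:

    #include <cassert>
    #include <cstdint>

    // The 32-entry table from @f.table / the C example in the patch comment.
    static const unsigned char Table[32] = {
        0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};

    // Table-based cttz exactly as in the C example: isolate the lowest set
    // bit, multiply by the de Bruijn constant, index with the top 5 bits.
    static int tableCttz(uint32_t X) {
      return Table[((X & (0u - X)) * 0x077CB531u) >> 27];
    }

    int main() {
      // Every nonzero input behaves like its lowest set bit, so checking
      // the 32 powers of two covers all reachable table indices.
      for (int Bit = 0; Bit < 32; ++Bit)
        assert(tableCttz(uint32_t(1) << Bit) == Bit);
      return 0;
    }
)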