Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -594,6 +594,10 @@
   /// considered as "Slow".
   enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware };
 
+  /// Return true if the target would benefit from recognizing a table-based
+  /// implementation of cttz.
+  bool preferCTTZLowering() const;
+
   /// Return true if the specified immediate is legal add immediate, that
   /// is the target has add instructions which can add a register with the
   /// immediate without having to materialize the immediate into a register.
@@ -1530,6 +1534,7 @@
       APInt &UndefElts2, APInt &UndefElts3,
       std::function<void(Instruction *, unsigned, APInt, APInt &)>
           SimplifyAndSetOp) = 0;
+  virtual bool preferCTTZLowering() = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1908,6 +1913,9 @@
         IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
         SimplifyAndSetOp);
   }
+  bool preferCTTZLowering() override {
+    return Impl.preferCTTZLowering();
+  }
   bool isLegalAddImmediate(int64_t Imm) override {
     return Impl.isLegalAddImmediate(Imm);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -200,6 +200,8 @@
   void getPeelingPreferences(Loop *, ScalarEvolution &,
                              TTI::PeelingPreferences &) const {}
 
+  bool preferCTTZLowering() const { return false; }
+
   bool isLegalAddImmediate(int64_t Imm) const { return false; }
 
   bool isLegalICmpImmediate(int64_t Imm) const { return false; }
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -293,6 +293,10 @@
     return nullptr;
   }
 
+  bool preferCTTZLowering() {
+    return getTLI()->preferCTTZLowering();
+  }
+
  bool isLegalAddImmediate(int64_t imm) {
     return getTLI()->isLegalAddImmediate(imm);
   }
Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2438,6 +2438,12 @@
     return true;
   }
 
+  /// Return true if the target would benefit from recognizing a table-based
+  /// implementation of cttz.
+  virtual bool preferCTTZLowering() const {
+    return false;
+  }
+
   /// Return true if the specified immediate is legal add immediate, that is the
   /// target has add instructions which can add a register with the immediate
   /// without having to materialize the immediate into a register.
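(Note: the four header changes above thread a single query through the
standard TTI layering, so a transform only ever calls
TargetTransformInfo::preferCTTZLowering() and the answer ultimately comes
from the target's TargetLowering. A backend therefore opts in by
overriding one hook. A minimal sketch, using a hypothetical
MyTargetLowering subclass that is not part of this patch:

    // Hypothetical backend opting in to table-based cttz recognition.
    // The TTI layers above forward the query down to this override.
    bool MyTargetLowering::preferCTTZLowering() const { return true; }

The AArch64 override doing exactly this appears below.)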
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -333,6 +333,10 @@
   return TTIImpl->getPeelingPreferences(L, SE, PP);
 }
 
+bool TargetTransformInfo::preferCTTZLowering() const {
+  return TTIImpl->preferCTTZLowering();
+}
+
 bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const {
   return TTIImpl->isLegalAddImmediate(Imm);
 }
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -607,6 +607,7 @@
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
+  bool preferCTTZLowering() const override;
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12830,6 +12830,10 @@
   return LLT();
 }
 
+bool AArch64TargetLowering::preferCTTZLowering() const {
+  return true;
+}
+
 // 12-bit optionally shifted immediates are legal for adds.
 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
   if (Immed == std::numeric_limits<int64_t>::min()) {
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2042,9 +2042,9 @@
 def REV16Xr : OneXRegData<0b001, "rev16", null_frag>;
 
 def : Pat<(cttz GPR32:$Rn),
-          (CLZWr (RBITWr GPR32:$Rn))>;
+          (ANDWri (CLZWr (RBITWr GPR32:$Rn)), (i32 4))>;
 def : Pat<(cttz GPR64:$Rn),
-          (CLZXr (RBITXr GPR64:$Rn))>;
+          (ANDXri (CLZXr (RBITXr GPR64:$Rn)), (i64 5))>;
 def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
                     (i32 1))),
           (CLSWr GPR32:$Rn)>;
Index: llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
@@ -363,10 +364,157 @@
   return false;
 }
 
+static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
+                        uint64_t Shift, uint64_t InputBits) {
+  unsigned Length = Table.getNumElements();
+  uint64_t IntWidth = InputBits / CHAR_BIT;
+  if (Length < InputBits || Length > InputBits * 2)
+    return false;
+
+  APInt Mask(InputBits, ((IntWidth << (InputBits - Shift)) - 1) << Shift);
+  unsigned Matched = 0;
+
+  for (unsigned i = 0; i < Length; i++) {
+    uint64_t Element = Table.getElementAsInteger(i);
+    if (Element < InputBits &&
+        (((Mul << Element) & Mask.getZExtValue()) >> Shift) == i)
+      Matched++;
+  }
+
+  return Matched == InputBits;
+}
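+
+// Worked example with the CASE 1 constants below (Mul = 125613361 =
+// 0x077CB531, Shift = 27, InputBits = 32): a valid table has Table[1] == 1,
+// and the check recomputes (0x077CB531 << 1) & 0xF8000000 = 0x08000000,
+// which shifted right by 27 gives 1, the entry's own index. The table is
+// accepted only when exactly InputBits entries map back to their index
+// this way, one per possible position of the lowest set bit.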
+
+// Try to recognize a table-based ctz implementation.
+// E.g., an example in C (for more cases please see the llvm/tests):
+// int f(unsigned x) {
+//    static const char table[32] =
+//      {0, 1, 28, 2, 29, 14, 24, 3, 30,
+//       22, 20, 15, 25, 17, 4, 8, 31, 27,
+//       13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
+//    return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
+// }
+// This can be lowered to a `cttz` instruction.
+// There is also a special case for the table element at index 0, which
+// determines whether the result is defined for a zero input.
+//
+// Here are some examples of IR for the AArch64 target:
+//
+// CASE 1:
+// %sub = sub i32 0, %x
+// %and = and i32 %sub, %x
+// %mul = mul i32 %and, 125613361
+// %shr = lshr i32 %mul, 27
+// %idxprom = zext i32 %shr to i64
+// %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @ctz1.table,
+//             i64 0, i64 %idxprom
+// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
+//
+// CASE 2:
+// %sub = sub i32 0, %x
+// %and = and i32 %sub, %x
+// %mul = mul i32 %and, 72416175
+// %shr = lshr i32 %mul, 26
+// %idxprom = zext i32 %shr to i64
+// %arrayidx = getelementptr inbounds [64 x i16], [64 x i16]* @ctz2.table,
+//             i64 0, i64 %idxprom
+// %0 = load i16, i16* %arrayidx, align 2, !tbaa !8
+//
+// CASE 3:
+// %sub = sub i32 0, %x
+// %and = and i32 %sub, %x
+// %mul = mul i32 %and, 81224991
+// %shr = lshr i32 %mul, 27
+// %idxprom = zext i32 %shr to i64
+// %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @ctz3.table,
+//             i64 0, i64 %idxprom
+// %0 = load i32, i32* %arrayidx, align 4, !tbaa !8
+//
+// CASE 4:
+// %sub = sub i64 0, %x
+// %and = and i64 %sub, %x
+// %mul = mul i64 %and, 283881067100198605
+// %shr = lshr i64 %mul, 58
+// %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @table, i64 0,
+//             i64 %shr
+// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
+//
+// All of these can be lowered to the @llvm.cttz.i32/i64 intrinsic.
+static bool tryToRecognizeTableBasedCttz(Instruction &I) {
+  LoadInst *LI = dyn_cast<LoadInst>(&I);
+  if (!LI)
+    return false;
+
+  Type *ElType = LI->getPointerOperandType()->getPointerElementType();
+  if (!ElType->isIntegerTy())
+    return false;
+
+  GetElementPtrInst *GEP =
+      dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
+  if (!GEP || !GEP->isInBounds() || GEP->getNumIndices() != 2)
+    return false;
+
+  Type *GEPPointeeType = GEP->getPointerOperandType()->getPointerElementType();
+  if (!GEPPointeeType->isArrayTy())
+    return false;
+
+  uint64_t ArraySize = GEPPointeeType->getArrayNumElements();
+  if (ArraySize != 32 && ArraySize != 64)
+    return false;
+
+  User *GEPUser = dyn_cast<User>(GEP->getPointerOperand());
+  if (!GEPUser)
+    return false;
+
+  ConstantDataArray *ConstData =
+      dyn_cast<ConstantDataArray>(GEPUser->getOperand(0));
+  if (!ConstData)
+    return false;
+
+  Value *Idx1 = GEP->idx_begin()->get();
+  Constant *Zero = dyn_cast<Constant>(Idx1);
+  if (!Zero || !Zero->isZeroValue())
+    return false;
+
+  Value *Idx2 = std::next(GEP->idx_begin())->get();
+
+  bool ConstIsWide = !match(Idx2, m_ZExt(m_Value()));
+
+  Value *X1;
+  uint64_t MulConst, ShiftConst;
+  // FIXME: AArch64 has i64 type for the GEP index, so this match will
+  // probably fail for other targets.
+  if (!match(Idx2,
+             m_ZExtOrSelf(m_LShr(
+                 m_ZExtOrSelf(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
                                    m_ConstantInt(MulConst))),
+                 m_ConstantInt(ShiftConst)))))
+    return false;
+
+  unsigned InputBits = ConstIsWide ? 64 : 32;
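+
+  // The index into an N-entry table is the top log2(N) bits of the
+  // multiply, and isCTTZTable accepts table lengths between InputBits and
+  // 2 * InputBits, so only a shift that extracts the top 5 to 7 bits can
+  // belong to a genuine cttz table lookup.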
+  // Shift should extract top 5..7 bits.
+  if (ShiftConst < InputBits - 7 || ShiftConst > InputBits - 5)
+    return false;
+
+  Type *XType = X1->getType();
+  if (!XType->isIntegerTy(InputBits))
+    return false;
+
+  if (!isCTTZTable(*ConstData, MulConst, ShiftConst, InputBits))
+    return false;
+
+  auto ZeroTableElem = ConstData->getElementAsInteger(0);
+  bool DefinedForZero = ZeroTableElem == InputBits;
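+
+  // When x is 0, x & -x is 0 and the load reads Table[0]. A table that
+  // stores InputBits there defines the result for a zero input, so the
+  // intrinsic below is emitted with its is_zero_undef flag cleared;
+  // otherwise the flag is set and the zero case stays undefined.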
+
+  IRBuilder<> B(LI);
+  ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
+  auto Cttz = B.CreateIntrinsic(Intrinsic::cttz, {XType}, {X1, BoolConst});
+  Value *ZExtOrTrunc = B.CreateZExtOrTrunc(Cttz, ElType);
+  LI->replaceAllUsesWith(ZExtOrTrunc);
+
+  return true;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
-static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
+static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
+                                TargetTransformInfo &TTI) {
   bool MadeChange = false;
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
@@ -382,6 +530,8 @@
       MadeChange |= foldAnyOrAllBitsSet(I);
       MadeChange |= foldGuardedFunnelShift(I, DT);
       MadeChange |= tryToRecognizePopCount(I);
+      if (TTI.preferCTTZLowering())
+        MadeChange |= tryToRecognizeTableBasedCttz(I);
     }
   }
 
@@ -396,12 +546,12 @@
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 static bool runImpl(Function &F, AssumptionCache &AC, TargetLibraryInfo &TLI,
-                    DominatorTree &DT) {
+                    DominatorTree &DT, TargetTransformInfo &TTI) {
   bool MadeChange = false;
   const DataLayout &DL = F.getParent()->getDataLayout();
   TruncInstCombine TIC(AC, TLI, DL, DT);
   MadeChange |= TIC.run(F);
-  MadeChange |= foldUnusualPatterns(F, DT);
+  MadeChange |= foldUnusualPatterns(F, DT, TTI);
   return MadeChange;
 }
 
@@ -410,6 +560,7 @@
     AU.setPreservesCFG();
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
     AU.addPreserved<BasicAAWrapperPass>();
@@ -421,7 +572,8 @@
     auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
     auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
     auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    return runImpl(F, AC, TLI, DT);
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    return runImpl(F, AC, TLI, DT, TTI);
   }
 
 PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
@@ -429,7 +581,8 @@
   auto &AC = AM.getResult<AssumptionAnalysis>(F);
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
-  if (!runImpl(F, AC, TLI, DT)) {
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  if (!runImpl(F, AC, TLI, DT, TTI)) {
     // No changes, all analyses are preserved.
     return PreservedAnalyses::all();
   }
@@ -445,6 +598,7 @@
                       "Combine pattern based expressions", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(AggressiveInstCombinerLegacyPass, "aggressive-instcombine",
                     "Combine pattern based expressions", false, false)
Index: llvm/test/Transforms/AggressiveInstCombine/AArch64/lit.local.cfg
===================================================================
--- /dev/null
+++ llvm/test/Transforms/AggressiveInstCombine/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
Index: llvm/test/Transforms/AggressiveInstCombine/AArch64/lower-table-based-ctz.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/AggressiveInstCombine/AArch64/lower-table-based-ctz.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -aggressive-instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
+
+@f.table = internal unnamed_addr constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
+
+define i32 @f(i32 %x) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @f.table, i64 0, i64 %idxprom
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
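(Note: as a standalone sanity check of the de Bruijn property this fold
relies on, the table from the test above can be verified over the 32
single-bit inputs, which are the only nonzero values x & -x can produce.
The program below is illustrative only and not part of the patch; names
such as tableCttz are made up:

    #include <cassert>
    #include <cstdint>

    // The 32-entry table from @f.table / the C example in the patch comment.
    static const unsigned char Table[32] = {
        0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};

    // Table-based cttz exactly as in the C example: isolate the lowest set
    // bit, multiply by the de Bruijn constant, index with the top 5 bits.
    static int tableCttz(uint32_t X) {
      return Table[((X & (0u - X)) * 0x077CB531u) >> 27];
    }

    int main() {
      // Every nonzero input behaves like its lowest set bit, so checking
      // the 32 powers of two covers all reachable table indices.
      for (int Bit = 0; Bit < 32; ++Bit)
        assert(tableCttz(uint32_t(1) << Bit) == Bit);
      return 0;
    }
)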