Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -71,6 +71,10 @@
 
   bool MadeIRChange = false;
 
+  // Handle bit counting patterns.
+  BitCountCombine BCC(TLI, DL);
+  MadeIRChange |= BCC.run(F);
+
   // Handle TruncInst patterns
   TruncInstCombine TIC(TLI, DL, DT);
   MadeIRChange |= TIC.run(F);
@@ -87,6 +91,10 @@
   auto &DL = F.getParent()->getDataLayout();
   bool MadeIRChange = false;
 
+  // Handle bit counting patterns.
+  BitCountCombine BCC(TLI, DL);
+  MadeIRChange |= BCC.run(F);
+
   // Handle TruncInst patterns
   TruncInstCombine TIC(TLI, DL, DT);
   MadeIRChange |= TIC.run(F);
Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -118,4 +118,28 @@
   /// \param SclTy scalar version of new type to reduce expression dag into.
   void ReduceExpressionDag(Type *SclTy);
 };
+
+//===----------------------------------------------------------------------===//
+// BitCountCombine - looks for code that computes the population count and
+// the count of leading zeros using the typical bit-twiddling algorithms
+// from Hacker's Delight.
+//===----------------------------------------------------------------------===//
+
+class BitCountCombine {
+public:
+  BitCountCombine(const TargetLibraryInfo &TLI, const DataLayout &DL)
+      : TLI(TLI), DL(DL) {}
+
+  bool run(Function &F);
+
+private:
+  Value *matchCtpopW(Instruction &In, unsigned BW);
+  Value *optimizeToCtpop(Instruction &In);
+  Value *optimizeToCtlz(Instruction &In);
+  bool runOnBlock(BasicBlock &B);
+
+  const TargetLibraryInfo &TLI;
+  const DataLayout &DL;
+};
+
 } // end namespace llvm.
Index: lib/Transforms/AggressiveInstCombine/BitCountCombine.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/AggressiveInstCombine/BitCountCombine.cpp
@@ -0,0 +1,296 @@
+//===- BitCountCombine.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This code looks for code that calculates the population count and the
+// leading zeros count using the bit-manipulation methods from Hacker's
+// Delight.
+//
+// ctpop(n):
+//   bw = bitwidth(n)
+//   n = (n & 0b01010101..0101) + ((n & 0b10101010..1010) >> 1)
+//   n = (n & 0b00110011..0011) + ((n & 0b11001100..1100) >> 2)
+//   ...
+//   n = (n & 0b000000..111111) + ((n & 0b111111..000000) >> bw/2)
+//   return n
+//
+// ctlz(n):
+//   bw = bitwidth(n)
+//   n = n | (n >> 1)
+//   n = n | (n >> 2)
+//   n = n | (n >> 4)
+//   ...
+//   n = n | (n >> bw/2)
+//   return bw - ctpop(n)
+//
+//===----------------------------------------------------------------------===//
+
+#include "AggressiveInstCombineInternal.h"
+#include "llvm/Analysis/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+// Check if In matches a ctpop calculation pattern for a value of width BW
+// bits. If so, return the argument V such that ctpop(V) would be a candidate
+// for replacing In.
+// The matched pattern is:
+//   x0 := (V & 0x55..5) + ((V>>1) & 0x55..5)
+//   x1 := (x0 & 0x33..3) + ((x0>>2) & 0x33..3)
+//   ...
+//   xn := (xn-1 & 0x00..0FF..F) + ((xn-1 >> BW/2) & 0x00..0FF..F)
+// where xn is the candidate for ctpop(V).
+Value *BitCountCombine::matchCtpopW(Instruction &In, unsigned BW) {
+  auto matchStep = [] (Value *V, unsigned S, const APInt &M, bool ShiftAlone)
+        -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Add(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+
+    auto matchAndShift = [S,&M,ShiftAlone] (Value *V0, Value *V1) -> Value* {
+      Value *V = nullptr;
+      const APInt *P;
+      auto Mask = m_APInt(P);
+      auto Shift = m_SpecificInt(S);
+
+      if (!match(V0, m_And(m_Value(V), Mask)))
+        return nullptr;
+      if (ShiftAlone) {
+        if (!match(V1, m_LShr(m_Specific(V), Shift)))
+          return nullptr;
+      } else {
+        if (!match(V1, m_And(m_LShr(m_Specific(V), Shift), Mask)))
+          return nullptr;
+      }
+      if (!APInt::isSameValue(M, *P))
+        return nullptr;
+      return V;
+    };
+
+    if (Value *T = matchAndShift(Op0, Op1))
+      return T;
+    if (Value *T = matchAndShift(Op1, Op0))
+      return T;
+    return nullptr;
+  };
+
+  // Generate the bitmask for the & operation. BW is the bit-width of the
+  // entire mask. The masks are:
+  //   0b01010101..01010101   0x55..55   1 bit every 2 bits
+  //   0b00110011..00110011   0x33..33   2 bits every 4 bits
+  //   0b00000111..00000111   0x07..07   3 bits every 8 bits
+  //   ...                    ...        log2(S) bits every S bits
+  // Normally the masks would be 01010101, 00110011, 00001111, i.e. the
+  // number of contiguous 1 bits in each group would be twice the number
+  // in the previous mask, but by the time this code runs, the "demanded"
+  // bits have been optimized to only require one more 1 bit in each
+  // subsequent mask. This function generates the post-optimized masks.
+  auto getMask = [] (unsigned S, unsigned BW) -> APInt {
+    assert(isPowerOf2_32(S));
+    APInt M(S, S-1);
+    return APInt::getSplat(BW, M);
+  };
+
+  Value *V = &In;
+  bool ShiftAlone = true;
+  unsigned N = BW;
+  while (N > 1) {
+    unsigned S = N/2;
+    V = matchStep(V, S, getMask(N, BW), ShiftAlone);
+    if (!V)
+      return nullptr;
+    N = S;
+    ShiftAlone = false;
+  }
+
+  return V;
+}
+
+// If In is an expression that evaluates a population count via the shift/add
+// pattern above, return the equivalent expression using the ctpop intrinsic.
+// Otherwise return nullptr.
+Value *BitCountCombine::optimizeToCtpop(Instruction &In) {
+  IntegerType *Ty = dyn_cast<IntegerType>(In.getType());
+  if (!Ty)
+    return nullptr;
+
+  // Take the first shift amount feeding the add, and assume this is the
+  // last shift in the popcnt computation.
+  Value *Op0 = nullptr, *Op1 = nullptr;
+  if (!match(&In, m_Add(m_Value(Op0), m_Value(Op1))))
+    return nullptr;
+
+  // Shift by half-width.
+  uint64_t SH = 0;
+  if (!match(Op0, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op1, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op0, m_LShr(m_Value(), m_ConstantInt(SH))) &&
+      !match(Op1, m_LShr(m_Value(), m_ConstantInt(SH))))
+    return nullptr;
+
+  if (SH < 4 || !isPowerOf2_64(SH))
+    return nullptr;
+
+  Value *V = matchCtpopW(In, 2*SH);
+  if (!V)
+    return nullptr;
+
+  unsigned TW = Ty->getBitWidth(), BW = 2*SH;
+  if (BW < TW) {
+    // BW is the bit width of the expression whose population count is
+    // being calculated. TW is the bit width of the type associated with
+    // that expression. Usually they are the same, but for ctpop8 the
+    // type may be "unsigned", i.e. 32-bit, while the ctpop8 would only
+    // consider the low 8 bits. In that case BW=8 and TW=32.
+    KnownBits K(TW);
+    computeKnownBits(V, K, DL);
+    APInt Need0 = APInt::getBitsSet(TW, BW, TW);
+    if ((K.Zero & Need0) != Need0)
+      return nullptr;
+  }
+
+  IRBuilder<> Builder(&In);
+  Module *M = In.getParent()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, {V->getType()});
+  CallInst *CI = Builder.CreateCall(Func, {V});
+  CI->setDebugLoc(In.getDebugLoc());
+  return CI;
+}
+
+Value *BitCountCombine::optimizeToCtlz(Instruction &In) {
+  // Let bw = bitwidth(n),
+  // convert
+  //   n = n | (n>>1)
+  //   n = n | (n>>2)
+  //   n = n | (n>>4)
+  //   ...
+  //   n = n | (n>>bw/2)
+  //   bw - ctpop(n)
+  // to
+  //   ctlz(n).
+  // This code expects that the ctpop intrinsic has already been generated.
+
+  // Skip the check for the subtract, since it may have been folded into
+  // another computation. Start the checks at the ctpop intrinsic.
+  if (!match(&In, m_Intrinsic<Intrinsic::ctpop>()))
+    return nullptr;
+
+  // Get the argument of the ctpop.
+  Value *V = In.getOperand(0);
+
+  // The argument to ctpop can be zero-extended in some cases. It is safe
+  // to ignore the zext.
+  if (auto *Z = dyn_cast<ZExtInst>(V))
+    V = Z->getOperand(0);
+
+  // A pre-existing ctpop intrinsic can return a vector.
+  IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
+  if (!Ty)
+    return nullptr;
+
+  // Start the width search at the number of bits not known to be zero.
+  KnownBits K(Ty->getBitWidth());
+  computeKnownBits(V, K, DL);
+  unsigned BW = Ty->getBitWidth() - K.Zero.countLeadingOnes();
+  if (!isPowerOf2_32(BW))
+    BW = NextPowerOf2(BW);
+
+  auto matchOrShift = [] (Value *V, unsigned S) -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Or(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+    if (match(Op0, m_LShr(m_Specific(Op1), m_SpecificInt(S))))
+      return Op1;
+    if (match(Op1, m_LShr(m_Specific(Op0), m_SpecificInt(S))))
+      return Op0;
+    return nullptr;
+  };
+
+  // The bitwidth of the input expression will be detected iteratively
+  // by trying the initial match with increasing widths. The bitwidth
+  // cannot be shorter than the expression whose ctpop was calculated,
+  // since the leading zero calculation would include bits not accounted
+  // for by the algorithm. Limit the bitwidth search to 64 bits.
+  while (!matchOrShift(V, BW/2) && BW <= 64)
+    BW *= 2;
+  if (BW > 64)
+    return nullptr;
+
+  unsigned N = BW;
+  while (N > 1) {
+    N /= 2;
+    V = matchOrShift(V, N);
+    if (!V)
+      return nullptr;
+  }
+
+  // The value of BW is the one that determines the type of ctlz's argument.
+  IRBuilder<> Builder(&In);
+  if (BW > Ty->getBitWidth()) {
+    IntegerType *ATy = IntegerType::get(In.getContext(), BW);
+    V = Builder.CreateZExt(V, ATy);
+  }
+  Module *M = In.getParent()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, {V->getType()});
+  Value *False = ConstantInt::getFalse(In.getContext());
+  Instruction *CI = Builder.CreateCall(Func, {V, False});
+  CI->setDebugLoc(In.getDebugLoc());
+  Value *Ext = In.getType() != CI->getType()
+                   ? Builder.CreateZExt(CI, In.getType())
+                   : CI;
+
+  // The original code computes ctlz(n) as "bw - ctpop(n')", and In is the
+  // ctpop(n') part being replaced here. Since ctpop(n') == bw - ctlz(n),
+  // return "bw - Ext", where Ext is the (possibly extended) ctlz value.
+  return Builder.CreateSub(ConstantInt::get(In.getType(), BW), Ext, "",
+                           true, true);
+}
+
+bool BitCountCombine::runOnBlock(BasicBlock &B) {
+  bool Changed = false;
+
+  // Iterate over the block as long as there are more intrinsics generated.
+  while (true) {
+    Value *Int = nullptr;
+    for (Instruction &In : reverse(B)) {
+      Int = optimizeToCtpop(In);
+      if (!Int)
+        Int = optimizeToCtlz(In);
+      if (Int) {
+        Changed = true;
+        In.replaceAllUsesWith(Int);
+        RecursivelyDeleteTriviallyDeadInstructions(&In, &TLI);
+        break;
+      }
+    }
+    if (!Int)
+      break;
+  }
+
+  return Changed;
+}
+
+bool BitCountCombine::run(Function &F) {
+  // Avoid optimizing compiler-rt functions that do the same thing.
+  // If the intrinsics expand to calls to compiler-rt, this could
+  // cause an infinite loop.
+  StringRef N = F.getName();
+  // Check for names of the form __popcount?i2, e.g. __popcountsi2:
+  if (N.size() == 13 && N.startswith("__popcount") && N.endswith("i2"))
+    return false;
+  // Check for names of the form __clz?i2, e.g. __clzsi2:
+  if (N.size() == 8 && N.startswith("__clz") && N.endswith("i2"))
+    return false;
+
+  bool Changed = false;
+  for (BasicBlock &B : F)
+    Changed |= runOnBlock(B);
+
+  return Changed;
+}
Index: lib/Transforms/AggressiveInstCombine/CMakeLists.txt
===================================================================
--- lib/Transforms/AggressiveInstCombine/CMakeLists.txt
+++ lib/Transforms/AggressiveInstCombine/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_llvm_library(LLVMAggressiveInstCombine
   AggressiveInstCombine.cpp
+  BitCountCombine.cpp
   TruncInstCombine.cpp
 
   ADDITIONAL_HEADER_DIRS
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -319,8 +319,8 @@
   MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
   MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
   // Combine silly seq's
-  if (OptLevel > 2)
-    MPM.add(createAggressiveInstCombinerPass());
+//  if (OptLevel > 2)
+//    MPM.add(createAggressiveInstCombinerPass());
   addInstructionCombiningPass(MPM);
   if (SizeLevel == 0 && !DisableLibCallsShrinkWrap)
     MPM.add(createLibCallsShrinkWrapPass());
@@ -470,6 +470,8 @@
   addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
 
   if (OptLevel > 2)
+    MPM.add(createAggressiveInstCombinerPass());
+  if (OptLevel > 2)
     MPM.add(createCallSiteSplittingPass());
 
   MPM.add(createIPSCCPPass());          // IP SCCP
Index: test/Transforms/AggressiveInstCombine/ctlz-combine.ll
===================================================================
--- /dev/null
+++ test/Transforms/AggressiveInstCombine/ctlz-combine.ll
@@ -0,0 +1,176 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s
+
+; unsigned ctlz16(unsigned short t0) {
+;   t0 = t0 | (t0>>1);
+;   t0 = t0 | (t0>>2);
+;   t0 = t0 | (t0>>4);
+;   t0 = t0 | (t0>>8);
+;   unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 0x5555);
+;   unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333);
+;   unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F);
+;   unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF);
+;   return 16-t4;
+; }
+;
+define i32 @ctlz16(i16 zeroext %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: @ctlz16(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i16 @llvm.ctlz.i16(i16 [[A0:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nuw nsw i32 16, [[TMP1]]
+; CHECK-NEXT:    [[V24:%.*]] = sub nsw i32 16, [[TMP2]]
+; CHECK-NEXT:    ret i32 [[V24]]
+;
+b0:
+  %v0 = lshr i16 %a0, 1
+  %v1 = or i16 %v0, %a0
+  %v2 = lshr i16 %v1, 2
+  %v3 = or i16 %v2, %v1
+  %v4 = lshr i16 %v3, 4
+  %v5 = or i16 %v4, %v3
+  %v6 = lshr i16 %v5, 8
+  %v7 = or i16 %v6, %v5
+  %v8 = zext i16 %v7 to i32
+  %v9 = and i32 %v8, 21845
+  %v10 = lshr i32 %v8, 1
+  %v11 = and i32 %v10, 21845
+  %v12 = add nuw nsw i32 %v9, %v11
+  %v13 = and i32 %v12, 13107
+  %v14 = lshr i32 %v12, 2
+  %v15 = and i32 %v14, 13107
+  %v16 = add nuw nsw i32 %v13, %v15
+  %v17 = and i32 %v16, 1799
+  %v18 = lshr i32 %v16, 4
+  %v19 = and i32 %v18, 1799
+  %v20 = add nuw nsw i32 %v17, %v19
+  %v21 = and i32 %v20, 15
+  %v22 = lshr i32 %v20, 8
+  %v23 = add nuw nsw i32 %v21, %v22
+  %v24 = sub nsw i32 16, %v23
+  ret i32 %v24
+}
+
+; unsigned ctlz32(unsigned t0) {
+;   t0 = t0 | (t0>>1);
+;   t0 = t0 | (t0>>2);
+;   t0 = t0 | (t0>>4);
+;   t0 = t0 | (t0>>8);
+;   t0 = t0 | (t0>>16);
+;   unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555);
+;   unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333);
+;   unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F);
+;   unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF);
+;   unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF);
+;   return 32-t5;
+; }
+;
+define i32 @ctlz32(i32 %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @ctlz32(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[A0:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw nsw i32 32, [[TMP0]]
+; CHECK-NEXT:    [[V29:%.*]] = sub nsw i32 32, [[TMP1]]
+; CHECK-NEXT:    ret i32 [[V29]]
+;
+b0:
+  %v0 = lshr i32 %a0, 1
+  %v1 = or i32 %v0, %a0
+  %v2 = lshr i32 %v1, 2
+  %v3 = or i32 %v1, %v2
+  %v4 = lshr i32 %v3, 4
+  %v5 = or i32 %v3, %v4
+  %v6 = lshr i32 %v5, 8
+  %v7 = or i32 %v5, %v6
+  %v8 = lshr i32 %v7, 16
+  %v9 = or i32 %v7, %v8
+  %v10 = and i32 %v9, 1431655765
+  %v11 = lshr i32 %v9, 1
+  %v12 = and i32 %v11, 1431655765
+  %v13 = add nuw i32 %v10, %v12
+  %v14 = and i32 %v13, 858993459
+  %v15 = lshr i32 %v13, 2
+  %v16 = and i32 %v15, 858993459
+  %v17 = add nuw nsw i32 %v14, %v16
+  %v18 = and i32 %v17, 117901063
+  %v19 = lshr i32 %v17, 4
+  %v20 = and i32 %v19, 117901063
+  %v21 = add nuw nsw i32 %v18, %v20
+  %v22 = and i32 %v21, 983055
+  %v23 = lshr i32 %v21, 8
+  %v24 = and i32 %v23, 983055
+  %v25 = add nuw nsw i32 %v22, %v24
+  %v26 = and i32 %v25, 31
+  %v27 = lshr i32 %v25, 16
+  %v28 = add nuw nsw i32 %v26, %v27
+  %v29 = sub nsw i32 32, %v28
+  ret i32 %v29
+}
+
+; typedef unsigned long long u64_t;
+; u64_t ctlz64(u64_t t0) {
+;   t0 = t0 | (t0>>1);
+;   t0 = t0 | (t0>>2);
+;   t0 = t0 | (t0>>4);
+;   t0 = t0 | (t0>>8);
+;   t0 = t0 | (t0>>16);
+;   t0 = t0 | (t0>>32);
+;   u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL);
+;   u64_t t2 = (t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL);
+;   u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL);
+;   u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL);
+;   u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL);
+;   u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL);
+;   return 64-t6;
+; }
+;
+define i64 @ctlz64(i64 %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @ctlz64(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A0:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw nsw i64 64, [[TMP0]]
+; CHECK-NEXT:    [[V35:%.*]] = sub nsw i64 64, [[TMP1]]
+; CHECK-NEXT:    ret i64 [[V35]]
+;
+b0:
+  %v0 = lshr i64 %a0, 1
+  %v1 = or i64 %v0, %a0
+  %v2 = lshr i64 %v1, 2
+  %v3 = or i64 %v1, %v2
+  %v4 = lshr i64 %v3, 4
+  %v5 = or i64 %v3, %v4
+  %v6 = lshr i64 %v5, 8
+  %v7 = or i64 %v5, %v6
+  %v8 = lshr i64 %v7, 16
+  %v9 = or i64 %v7, %v8
+  %v10 = lshr i64 %v9, 32
+  %v11 = or i64 %v9, %v10
+  %v12 = and i64 %v11, 6148914691236517205
+  %v13 = lshr i64 %v11, 1
+  %v14 = and i64 %v13, 6148914691236517205
+  %v15 = add nuw i64 %v12, %v14
+  %v16 = and i64 %v15, 3689348814741910323
+  %v17 = lshr i64 %v15, 2
+  %v18 = and i64 %v17, 3689348814741910323
+  %v19 = add nuw nsw i64 %v16, %v18
+  %v20 = and i64 %v19, 506381209866536711
+  %v21 = lshr i64 %v19, 4
+  %v22 = and i64 %v21, 506381209866536711
+  %v23 = add nuw nsw i64 %v20, %v22
+  %v24 = and i64 %v23, 4222189076152335
+  %v25 = lshr i64 %v23, 8
+  %v26 = and i64 %v25, 4222189076152335
+  %v27 = add nuw nsw i64 %v24, %v26
+  %v28 = and i64 %v27, 133143986207
+  %v29 = lshr i64 %v27, 16
+  %v30 = and i64 %v29, 133143986207
+  %v31 = add nuw nsw i64 %v28, %v30
+  %v32 = and i64 %v31, 63
+  %v33 = lshr i64 %v31, 32
+  %v34 = add nuw nsw i64 %v32, %v33
+  %v35 = sub nsw i64 64, %v34
+  ret i64 %v35
+}
+
+attributes #0 = { norecurse nounwind readnone uwtable }
+attributes #1 = { nounwind uwtable }
Index: test/Transforms/AggressiveInstCombine/ctlz-rt.ll
===================================================================
--- /dev/null
+++ test/Transforms/AggressiveInstCombine/ctlz-rt.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s
+
+; Make sure that the intrinsic is not generated for compiler-rt functions.
+
+define i32 @__clzsi2(i32 %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: @__clzsi2(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = lshr i32 [[A0:%.*]], 1
+; CHECK-NEXT:    [[V1:%.*]] = or i32 [[V0]], [[A0]]
+; CHECK-NEXT:    [[V2:%.*]] = lshr i32 [[V1]], 2
+; CHECK-NEXT:    [[V3:%.*]] = or i32 [[V1]], [[V2]]
+; CHECK-NEXT:    [[V4:%.*]] = lshr i32 [[V3]], 4
+; CHECK-NEXT:    [[V5:%.*]] = or i32 [[V3]], [[V4]]
+; CHECK-NEXT:    [[V6:%.*]] = lshr i32 [[V5]], 8
+; CHECK-NEXT:    [[V7:%.*]] = or i32 [[V5]], [[V6]]
+; CHECK-NEXT:    [[V8:%.*]] = lshr i32 [[V7]], 16
+; CHECK-NEXT:    [[V9:%.*]] = or i32 [[V7]], [[V8]]
+; CHECK-NEXT:    [[V10:%.*]] = and i32 [[V9]], 1431655765
+; CHECK-NEXT:    [[V11:%.*]] = lshr i32 [[V9]], 1
+; CHECK-NEXT:    [[V12:%.*]] = and i32 [[V11]], 1431655765
+; CHECK-NEXT:    [[V13:%.*]] = add nuw i32 [[V10]], [[V12]]
+; CHECK-NEXT:    [[V14:%.*]] = and i32 [[V13]], 858993459
+; CHECK-NEXT:    [[V15:%.*]] = lshr i32 [[V13]], 2
+; CHECK-NEXT:    [[V16:%.*]] = and i32 [[V15]], 858993459
+; CHECK-NEXT:    [[V17:%.*]] = add nuw nsw i32 [[V14]], [[V16]]
+; CHECK-NEXT:    [[V18:%.*]] = and i32 [[V17]], 117901063
+; CHECK-NEXT:    [[V19:%.*]] = lshr i32 [[V17]], 4
+; CHECK-NEXT:    [[V20:%.*]] = and i32 [[V19]], 117901063
+; CHECK-NEXT:    [[V21:%.*]] = add nuw nsw i32 [[V18]], [[V20]]
+; CHECK-NEXT:    [[V22:%.*]] = and i32 [[V21]], 983055
+; CHECK-NEXT:    [[V23:%.*]] = lshr i32 [[V21]], 8
+; CHECK-NEXT:    [[V24:%.*]] = and i32 [[V23]], 983055
+; CHECK-NEXT:    [[V25:%.*]] = add nuw nsw i32 [[V22]], [[V24]]
+; CHECK-NEXT:    [[V26:%.*]] = and i32 [[V25]], 31
+; CHECK-NEXT:    [[V27:%.*]] = lshr i32 [[V25]], 16
+; CHECK-NEXT:    [[V28:%.*]] = add nuw nsw i32 [[V26]], [[V27]]
+; CHECK-NEXT:    [[V29:%.*]] = sub nsw i32 32, [[V28]]
+; CHECK-NEXT:    ret i32 [[V29]]
+;
+b0:
+  %v0 = lshr i32 %a0, 1
+  %v1 = or i32 %v0, %a0
+  %v2 = lshr i32 %v1, 2
+  %v3 = or i32 %v1, %v2
+  %v4 = lshr i32 %v3, 4
+  %v5 = or i32 %v3, %v4
+  %v6 = lshr i32 %v5, 8
+  %v7 = or i32 %v5, %v6
+  %v8 = lshr i32 %v7, 16
+  %v9 = or i32 %v7, %v8
+  %v10 = and i32 %v9, 1431655765
+  %v11 = lshr i32 %v9, 1
+  %v12 = and i32 %v11, 1431655765
+  %v13 = add nuw i32 %v10, %v12
+  %v14 = and i32 %v13, 858993459
+  %v15 = lshr i32 %v13, 2
+  %v16 = and i32 %v15, 858993459
+  %v17 = add nuw nsw i32 %v14, %v16
+  %v18 = and i32 %v17, 117901063
+  %v19 = lshr i32 %v17, 4
+  %v20 = and i32 %v19, 117901063
+  %v21 = add nuw nsw i32 %v18, %v20
+  %v22 = and i32 %v21, 983055
+  %v23 = lshr i32 %v21, 8
+  %v24 = and i32 %v23, 983055
+  %v25 = add nuw nsw i32 %v22, %v24
+  %v26 = and i32 %v25, 31
+  %v27 = lshr i32 %v25, 16
+  %v28 = add nuw nsw i32 %v26, %v27
+  %v29 = sub nsw i32 32, %v28
+  ret i32 %v29
+}
+
+attributes #0 = { norecurse nounwind readnone }
Index: test/Transforms/AggressiveInstCombine/ctpop-combine.ll
===================================================================
--- /dev/null
+++ test/Transforms/AggressiveInstCombine/ctpop-combine.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s
+
+; unsigned pop8(unsigned char t0) {
+;   unsigned t1 = (t0 & 0x55) + ((t0>>1) & 0x55);
+;   unsigned t2 = (t1 & 0x33) + ((t1>>2) & 0x33);
+;   unsigned t3 = (t2 & 0x0F) + ((t2>>4) & 0x0F);
+;   return t3;
+; }
+;
+define i32 @pop8(i8 zeroext %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: @pop8(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = zext i8 [[A0:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[V0]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+b0:
+  %v0 = zext i8 %a0 to i32
+  %v1 = and i32 %v0, 85
+  %v2 = lshr i32 %v0, 1
+  %v3 = and i32 %v2, 85
+  %v4 = add nuw nsw i32 %v1, %v3
+  %v5 = and i32 %v4, 51
+  %v6 = lshr i32 %v4, 2
+  %v7 = and i32 %v6, 51
+  %v8 = add nuw nsw i32 %v5, %v7
+  %v9 = and i32 %v8, 7
+  %v10 = lshr i32 %v8, 4
+  %v11 = add nuw nsw i32 %v9, %v10
+  ret i32 %v11
+}
+
+; unsigned pop16(unsigned short t0) {
+;   unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 0x5555);
+;   unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333);
+;   unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F);
+;   unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF);
+;   return t4;
+; }
+;
+define i32 @pop16(i16 zeroext %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @pop16(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = zext i16 [[A0:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[V0]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+b0:
+  %v0 = zext i16 %a0 to i32
+  %v1 = and i32 %v0, 21845
+  %v2 = lshr i32 %v0, 1
+  %v3 = and i32 %v2, 21845
+  %v4 = add nuw nsw i32 %v1, %v3
+  %v5 = and i32 %v4, 13107
+  %v6 = lshr i32 %v4, 2
+  %v7 = and i32 %v6, 13107
+  %v8 = add nuw nsw i32 %v5, %v7
+  %v9 = and i32 %v8, 1799
+  %v10 = lshr i32 %v8, 4
+  %v11 = and i32 %v10, 1799
+  %v12 = add nuw nsw i32 %v9, %v11
+  %v13 = and i32 %v12, 15
+  %v14 = lshr i32 %v12, 8
+  %v15 = add nuw nsw i32 %v13, %v14
+  ret i32 %v15
+}
+
+; unsigned pop32(unsigned t0) {
+;   unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555);
+;   unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333);
+;   unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F);
+;   unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF);
+;   unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF);
+;   return t5;
+; }
+;
+define i32 @pop32(i32 %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @pop32(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[A0:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+b0:
+  %v0 = and i32 %a0, 1431655765
+  %v1 = lshr i32 %a0, 1
+  %v2 = and i32 %v1, 1431655765
+  %v3 = add nuw i32 %v0, %v2
+  %v4 = and i32 %v3, 858993459
+  %v5 = lshr i32 %v3, 2
+  %v6 = and i32 %v5, 858993459
+  %v7 = add nuw nsw i32 %v4, %v6
+  %v8 = and i32 %v7, 117901063
+  %v9 = lshr i32 %v7, 4
+  %v10 = and i32 %v9, 117901063
+  %v11 = add nuw nsw i32 %v8, %v10
+  %v12 = and i32 %v11, 983055
+  %v13 = lshr i32 %v11, 8
+  %v14 = and i32 %v13, 983055
+  %v15 = add nuw nsw i32 %v12, %v14
+  %v16 = and i32 %v15, 31
+  %v17 = lshr i32 %v15, 16
+  %v18 = add nuw nsw i32 %v16, %v17
+  ret i32 %v18
+}
+
+; typedef unsigned long long u64_t;
+; u64_t pop64(u64_t t0) {
+;   u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL);
+;   u64_t t2 = (t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL);
+;   u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL);
+;   u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL);
+;   u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL);
+;   u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL);
+;   return t6;
+; }
+;
+define i64 @pop64(i64 %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @pop64(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[A0:%.*]])
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+b0:
+  %v0 = and i64 %a0, 6148914691236517205
+  %v1 = lshr i64 %a0, 1
+  %v2 = and i64 %v1, 6148914691236517205
+  %v3 = add nuw i64 %v0, %v2
+  %v4 = and i64 %v3, 3689348814741910323
+  %v5 = lshr i64 %v3, 2
+  %v6 = and i64 %v5, 3689348814741910323
+  %v7 = add nuw nsw i64 %v4, %v6
+  %v8 = and i64 %v7, 506381209866536711
+  %v9 = lshr i64 %v7, 4
+  %v10 = and i64 %v9, 506381209866536711
+  %v11 = add nuw nsw i64 %v8, %v10
+  %v12 = and i64 %v11, 4222189076152335
+  %v13 = lshr i64 %v11, 8
+  %v14 = and i64 %v13, 4222189076152335
+  %v15 = add nuw nsw i64 %v12, %v14
+  %v16 = and i64 %v15, 133143986207
+  %v17 = lshr i64 %v15, 16
+  %v18 = and i64 %v17, 133143986207
+  %v19 = add nuw nsw i64 %v16, %v18
+  %v20 = and i64 %v19, 63
+  %v21 = lshr i64 %v19, 32
+  %v22 = add nuw nsw i64 %v20, %v21
+  ret i64 %v22
+}
+
+; Negative test: one of the mask values is incorrect for ctpop.
+define i32 @not_pop32(i32 %a0) local_unnamed_addr #1 {
+; CHECK-LABEL: @not_pop32(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = and i32 [[A0:%.*]], 1431655765
+; CHECK-NEXT:    [[V1:%.*]] = lshr i32 [[A0]], 1
+; CHECK-NEXT:    [[V2:%.*]] = and i32 [[V1]], 1431655765
+; CHECK-NEXT:    [[V3:%.*]] = add nuw i32 [[V0]], [[V2]]
+; CHECK-NEXT:    [[V4:%.*]] = and i32 [[V3]], 858993459
+; CHECK-NEXT:    [[V5:%.*]] = lshr i32 [[V3]], 2
+; CHECK-NEXT:    [[V6:%.*]] = and i32 [[V5]], 858993459
+; CHECK-NEXT:    [[V7:%.*]] = add nuw nsw i32 [[V4]], [[V6]]
+; CHECK-NEXT:    [[V8:%.*]] = and i32 [[V7]], 117902063
+; CHECK-NEXT:    [[V9:%.*]] = lshr i32 [[V7]], 4
+; CHECK-NEXT:    [[V10:%.*]] = and i32 [[V9]], 117902063
+; CHECK-NEXT:    [[V11:%.*]] = add nuw nsw i32 [[V8]], [[V10]]
+; CHECK-NEXT:    [[V12:%.*]] = and i32 [[V11]], 983055
+; CHECK-NEXT:    [[V13:%.*]] = lshr i32 [[V11]], 8
+; CHECK-NEXT:    [[V14:%.*]] = and i32 [[V13]], 983055
+; CHECK-NEXT:    [[V15:%.*]] = add nuw nsw i32 [[V12]], [[V14]]
+; CHECK-NEXT:    [[V16:%.*]] = and i32 [[V15]], 31
+; CHECK-NEXT:    [[V17:%.*]] = lshr i32 [[V15]], 16
+; CHECK-NEXT:    [[V18:%.*]] = add nuw nsw i32 [[V16]], [[V17]]
+; CHECK-NEXT:    ret i32 [[V18]]
+;
+b0:
+  %v0 = and i32 %a0, 1431655765
+  %v1 = lshr i32 %a0, 1
+  %v2 = and i32 %v1, 1431655765
+  %v3 = add nuw i32 %v0, %v2
+  %v4 = and i32 %v3, 858993459
+  %v5 = lshr i32 %v3, 2
+  %v6 = and i32 %v5, 858993459
+  %v7 = add nuw nsw i32 %v4, %v6
+  %v8 = and i32 %v7, 117902063
+  %v9 = lshr i32 %v7, 4
+  %v10 = and i32 %v9, 117902063
+  %v11 = add nuw nsw i32 %v8, %v10
+  %v12 = and i32 %v11, 983055
+  %v13 = lshr i32 %v11, 8
+  %v14 = and i32 %v13, 983055
+  %v15 = add nuw nsw i32 %v12, %v14
+  %v16 = and i32 %v15, 31
+  %v17 = lshr i32 %v15, 16
+  %v18 = add nuw nsw i32 %v16, %v17
+  ret i32 %v18
+}
+
+
+attributes #0 = { norecurse nounwind readnone }
+attributes #1 = { nounwind uwtable }
Index: test/Transforms/AggressiveInstCombine/ctpop-rt.ll
===================================================================
--- /dev/null
+++ test/Transforms/AggressiveInstCombine/ctpop-rt.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -aggressive-instcombine -S < %s | FileCheck %s
+
+; Make sure that the intrinsic is not generated for compiler-rt functions.
+
+define i32 @__popcountsi2(i32 %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: @__popcountsi2(
+; CHECK-NEXT:  b0:
+; CHECK-NEXT:    [[V0:%.*]] = and i32 [[A0:%.*]], 1431655765
+; CHECK-NEXT:    [[V1:%.*]] = lshr i32 [[A0]], 1
+; CHECK-NEXT:    [[V2:%.*]] = and i32 [[V1]], 1431655765
+; CHECK-NEXT:    [[V3:%.*]] = add nuw i32 [[V0]], [[V2]]
+; CHECK-NEXT:    [[V4:%.*]] = and i32 [[V3]], 858993459
+; CHECK-NEXT:    [[V5:%.*]] = lshr i32 [[V3]], 2
+; CHECK-NEXT:    [[V6:%.*]] = and i32 [[V5]], 858993459
+; CHECK-NEXT:    [[V7:%.*]] = add nuw nsw i32 [[V4]], [[V6]]
+; CHECK-NEXT:    [[V8:%.*]] = and i32 [[V7]], 117901063
+; CHECK-NEXT:    [[V9:%.*]] = lshr i32 [[V7]], 4
+; CHECK-NEXT:    [[V10:%.*]] = and i32 [[V9]], 117901063
+; CHECK-NEXT:    [[V11:%.*]] = add nuw nsw i32 [[V8]], [[V10]]
+; CHECK-NEXT:    [[V12:%.*]] = and i32 [[V11]], 983055
+; CHECK-NEXT:    [[V13:%.*]] = lshr i32 [[V11]], 8
+; CHECK-NEXT:    [[V14:%.*]] = and i32 [[V13]], 983055
+; CHECK-NEXT:    [[V15:%.*]] = add nuw nsw i32 [[V12]], [[V14]]
+; CHECK-NEXT:    [[V16:%.*]] = and i32 [[V15]], 31
+; CHECK-NEXT:    [[V17:%.*]] = lshr i32 [[V15]], 16
+; CHECK-NEXT:    [[V18:%.*]] = add nuw nsw i32 [[V16]], [[V17]]
+; CHECK-NEXT:    ret i32 [[V18]]
+;
+b0:
+  %v0 = and i32 %a0, 1431655765
+  %v1 = lshr i32 %a0, 1
+  %v2 = and i32 %v1, 1431655765
+  %v3 = add nuw i32 %v0, %v2
+  %v4 = and i32 %v3, 858993459
+  %v5 = lshr i32 %v3, 2
+  %v6 = and i32 %v5, 858993459
+  %v7 = add nuw nsw i32 %v4, %v6
+  %v8 = and i32 %v7, 117901063
+  %v9 = lshr i32 %v7, 4
+  %v10 = and i32 %v9, 117901063
+  %v11 = add nuw nsw i32 %v8, %v10
+  %v12 = and i32 %v11, 983055
+  %v13 = lshr i32 %v11, 8
+  %v14 = and i32 %v13, 983055
+  %v15 = add nuw nsw i32 %v12, %v14
+  %v16 = and i32 %v15, 31
+  %v17 = lshr i32 %v15, 16
+  %v18 = add nuw nsw i32 %v16, %v17
+  ret i32 %v18
+}
+
+attributes #0 = { norecurse nounwind readnone }
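
For reference, a minimal C sketch (illustrative only, not part of the patch) of the source-level idiom the new pass targets; it simply restates the pop8 pattern from the ctpop-combine.ll comments above. Given IR in the already-canonicalized form the tests use (masks narrowed by instcombine), running "opt -aggressive-instcombine -S" as in the RUN lines is expected to collapse the whole add/shift chain into a single @llvm.ctpop call.

unsigned pop8(unsigned char t0) {
  /* Hacker's Delight popcount: each step adds adjacent groups of bits. */
  unsigned t1 = (t0 & 0x55) + ((t0 >> 1) & 0x55); /* 1-bit groups -> 2-bit sums  */
  unsigned t2 = (t1 & 0x33) + ((t1 >> 2) & 0x33); /* 2-bit groups -> 4-bit sums  */
  unsigned t3 = (t2 & 0x0F) + ((t2 >> 4) & 0x0F); /* 4-bit groups -> final count */
  return t3;
}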