Index: llvm/include/llvm/IR/PatternMatch.h
===================================================================
--- llvm/include/llvm/IR/PatternMatch.h
+++ llvm/include/llvm/IR/PatternMatch.h
@@ -643,11 +643,11 @@
 };
 
 /// Match a specified integer value or vector of all elements of that
-// value.
+/// value.
 struct specific_intval {
-  uint64_t Val;
+  APInt Val;
 
-  specific_intval(uint64_t V) : Val(V) {}
+  specific_intval(APInt V) : Val(V) {}
 
   template <typename ITy> bool match(ITy *V) {
     const auto *CI = dyn_cast<ConstantInt>(V);
@@ -655,13 +655,17 @@
       if (const auto *C = dyn_cast<Constant>(V))
         CI = dyn_cast_or_null<ConstantInt>(C->getSplatValue());
 
-    return CI && CI->getValue() == Val;
+    return CI && APInt::isSameValue(CI->getValue(), Val);
   }
 };
 
 /// Match a specific integer value or vector with all elements equal to
 /// the value.
-inline specific_intval m_SpecificInt(uint64_t V) { return specific_intval(V); }
+inline specific_intval m_SpecificInt(APInt V) { return specific_intval(V); }
+
+inline specific_intval m_SpecificInt(uint64_t V) {
+  return m_SpecificInt(APInt(64, V));
+}
 
 /// Match a ConstantInt and bind to its value.  This does not match
 /// ConstantInts wider than 64-bits.
Index: llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -250,6 +250,72 @@
   return true;
 }
 
+// Try to recognize the function below as a popcount intrinsic.
+// This is the "best" algorithm from
+// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+// Also used in TargetLowering::expandCTPOP().
+//
+// int popcount(unsigned int i) {
+//   i = i - ((i >> 1) & 0x55555555);
+//   i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+//   i = ((i + (i >> 4)) & 0x0F0F0F0F);
+//   return (i * 0x01010101) >> 24;
+// }
+static bool tryToRecognizePopCount(Instruction &I) {
+  if (I.getOpcode() != Instruction::LShr)
+    return false;
+
+  Type *Ty = I.getType();
+  if (!Ty->isIntOrIntVectorTy())
+    return false;
+
+  unsigned Len = Ty->getScalarSizeInBits();
+  // FIXME: fix Len == 8 and other irregular type lengths.
+  if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
+    return false;
+
+  APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
+  APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
+  APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
+  APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
+  APInt MaskShift = APInt(Len, Len - 8);
+
+  Value *Op0 = I.getOperand(0);
+  Value *Op1 = I.getOperand(1);
+  Value *MulOp0;
+  // Matching "(i * 0x01010101...) >> 24".
+  if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
+      match(Op1, m_SpecificInt(MaskShift))) {
+    Value *ShiftOp0;
+    // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
+    if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
+                                    m_Deferred(ShiftOp0)),
+                            m_SpecificInt(Mask0F)))) {
+      Value *AndOp0;
+      // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
+      if (match(ShiftOp0,
+                m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
+                        m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)),
+                              m_SpecificInt(Mask33))))) {
+        Value *Root, *SubOp1;
+        // Matching "i - ((i >> 1) & 0x55555555...)".
+        if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
+            match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
+                                m_SpecificInt(Mask55)))) {
+          LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
+          IRBuilder<> Builder(&I);
+          Function *Func = Intrinsic::getDeclaration(
+              I.getModule(), Intrinsic::ctpop, I.getType());
+          I.replaceAllUsesWith(Builder.CreateCall(Func, {Root}));
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
@@ -268,6 +334,7 @@
     for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
       MadeChange |= foldAnyOrAllBitsSet(I);
       MadeChange |= foldGuardedRotateToFunnelShift(I);
+      MadeChange |= tryToRecognizePopCount(I);
     }
   }
 
Index: llvm/test/Transforms/AggressiveInstCombine/popcount.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/AggressiveInstCombine/popcount.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O3 < %s -instcombine -S | FileCheck %s
+
+;int popcount8(unsigned char i) {
+;  i = i - ((i >> 1) & 0x55);
+;  i = (i & 0x33) + ((i >> 2) & 0x33);
+;  i = ((i + (i >> 4)) & 0x0F);
+;  return (i * 0x01010101);
+;}
+define signext i32 @popcount8(i8 zeroext %0) {
+; CHECK-LABEL: @popcount8(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP0:%.*]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP2]], 85
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i8 [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i8 [[TMP4]], 51
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i8 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = and i8 [[TMP6]], 51
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i8 [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr i8 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i8 [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and i8 [[TMP10]], 15
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP11]] to i32
+; CHECK-NEXT:    ret i32 [[TMP12]]
+;
+  %2 = lshr i8 %0, 1
+  %3 = and i8 %2, 85
+  %4 = sub i8 %0, %3
+  %5 = and i8 %4, 51
+  %6 = lshr i8 %4, 2
+  %7 = and i8 %6, 51
+  %8 = add nuw nsw i8 %7, %5
+  %9 = lshr i8 %8, 4
+  %10 = add nuw nsw i8 %9, %8
+  %11 = and i8 %10, 15
+  %12 = zext i8 %11 to i32
+  ret i32 %12
+}
+
+;int popcount32(unsigned i) {
+;  i = i - ((i >> 1) & 0x55555555);
+;  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+;  i = ((i + (i >> 4)) & 0x0F0F0F0F);
+;  return (i * 0x01010101) >> 24;
+;}
+define signext i32 @popcount32(i32 zeroext %0) {
+; CHECK-LABEL: @popcount32(
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[TMP0:%.*]]), !range !0
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %2 = lshr i32 %0, 1
+  %3 = and i32 %2, 1431655765
+  %4 = sub i32 %0, %3
+  %5 = and i32 %4, 858993459
+  %6 = lshr i32 %4, 2
+  %7 = and i32 %6, 858993459
+  %8 = add nuw nsw i32 %7, %5
+  %9 = lshr i32 %8, 4
+  %10 = add nuw nsw i32 %9, %8
+  %11 = and i32 %10, 252645135
+  %12 = mul i32 %11, 16843009
+  %13 = lshr i32 %12, 24
+  ret i32 %13
+}
+
+;int popcount64(unsigned long long i) {
+;  i = i - ((i >> 1) & 0x5555555555555555);
+;  i = (i & 0x3333333333333333) + ((i >> 2) & 0x3333333333333333);
+;  i = ((i + (i >> 4)) & 0x0F0F0F0F0F0F0F0F);
+;  return (i * 0x0101010101010101) >> 56;
+;}
+define signext i32 @popcount64(i64 %0) {
+; CHECK-LABEL: @popcount64(
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP0:%.*]]), !range !1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %2 = lshr i64 %0, 1
+  %3 = and i64 %2, 6148914691236517205
+  %4 = sub i64 %0, %3
+  %5 = and i64 %4, 3689348814741910323
+  %6 = lshr i64 %4, 2
+  %7 = and i64 %6, 3689348814741910323
+  %8 = add nuw nsw i64 %7, %5
+  %9 = lshr i64 %8, 4
+  %10 = add nuw nsw i64 %9, %8
+  %11 = and i64 %10, 1085102592571150095
+  %12 = mul i64 %11, 72340172838076673
+  %13 = lshr i64 %12, 56
+  %14 = trunc i64 %13 to i32
+  ret i32 %14
+}
+
+;int popcount128(__uint128_t i) {
+;  __uint128_t x = 0x5555555555555555;
+;  x <<= 64;
+;  x |= 0x5555555555555555;
+;  __uint128_t y = 0x3333333333333333;
+;  y <<= 64;
+;  y |= 0x3333333333333333;
+;  __uint128_t z = 0x0f0f0f0f0f0f0f0f;
+;  z <<= 64;
+;  z |= 0x0f0f0f0f0f0f0f0f;
+;  __uint128_t a = 0x0101010101010101;
+;  a <<= 64;
+;  a |= 0x0101010101010101;
+;  unsigned mask = 120;
+;  i = i - ((i >> 1) & x);
+;  i = (i & y) + ((i >> 2) & y);
+;  i = ((i + (i >> 4)) & z);
+;  return (i * a) >> mask;
+;}
+define signext i32 @popcount128(i128 %0) {
+; CHECK-LABEL: @popcount128(
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i128 @llvm.ctpop.i128(i128 [[TMP0:%.*]]), !range !2
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i128 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %2 = lshr i128 %0, 1
+  %3 = and i128 %2, 113427455640312821154458202477256070485
+  %4 = sub i128 %0, %3
+  %5 = and i128 %4, 68056473384187692692674921486353642291
+  %6 = lshr i128 %4, 2
+  %7 = and i128 %6, 68056473384187692692674921486353642291
+  %8 = add nuw nsw i128 %7, %5
+  %9 = lshr i128 %8, 4
+  %10 = add nuw nsw i128 %9, %8
+  %11 = and i128 %10, 20016609818878733144904388672456953615
+  %12 = mul i128 %11, 1334440654591915542993625911497130241
+  %13 = lshr i128 %12, 120
+  %14 = trunc i128 %13 to i32
+  ret i32 %14
+}
+
+;vector unsigned char popcount8vec(vector unsigned char i)
+;{
+;  i = i - ((i >> 1) & 0x55);
+;  i = (i & 0x33) + ((i >> 2) & 0x33);
+;  i = ((i + (i >> 4)) & 0x0F);
+;  return (i * 0x01);
+;}
+define <16 x i8> @popcount8vec(<16 x i8> %0) {
+; CHECK-LABEL: @popcount8vec(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <16 x i8> [[TMP0:%.*]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i8> [[TMP2]], <i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <16 x i8> [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and <16 x i8> [[TMP4]], <i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51>
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr <16 x i8> [[TMP4]], <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i8> [[TMP6]], <i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51>
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <16 x i8> [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr <16 x i8> [[TMP8]], <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw <16 x i8> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i8> [[TMP10]], <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP11]]
+;
+  %2 = lshr <16 x i8> %0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = and <16 x i8> %2, <i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85>
+  %4 = sub <16 x i8> %0, %3
+  %5 = and <16 x i8> %4, <i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51>
+  %6 = lshr <16 x i8> %4, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+  %7 = and <16 x i8> %6, <i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51>
+  %8 = add nuw nsw <16 x i8> %7, %5
+  %9 = lshr <16 x i8> %8, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+  %10 = add nuw nsw <16 x i8> %9, %8
+  %11 = and <16 x i8> %10, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+  ret <16 x i8> %11
+}
+
+;vector unsigned int popcount32vec(vector unsigned int i)
+;{
+;  i = i - ((i >> 1) & 0x55555555);
+;  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+;  i = ((i + (i >> 4)) & 0x0F0F0F0F);
+;  return (i * 0x01010101) >> 24;
+;}
+define <4 x i32> @popcount32vec(<4 x i32> %0) {
+; CHECK-LABEL: @popcount32vec(
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP0:%.*]])
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %2 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  %3 = and <4 x i32> %2, <i32 1431655765, i32 1431655765, i32 1431655765, i32 1431655765>
+  %4 = sub <4 x i32> %0, %3
+  %5 = and <4 x i32> %4, <i32 858993459, i32 858993459, i32 858993459, i32 858993459>
+  %6 = lshr <4 x i32> %4, <i32 2, i32 2, i32 2, i32 2>
+  %7 = and <4 x i32> %6, <i32 858993459, i32 858993459, i32 858993459, i32 858993459>
+  %8 = add nuw nsw <4 x i32> %7, %5
+  %9 = lshr <4 x i32> %8, <i32 4, i32 4, i32 4, i32 4>
+  %10 = add nuw nsw <4 x i32> %9, %8
+  %11 = and <4 x i32> %10, <i32 252645135, i32 252645135, i32 252645135, i32 252645135>
+  %12 = mul <4 x i32> %11, <i32 16843009, i32 16843009, i32 16843009, i32 16843009>
+  %13 = lshr <4 x i32> %12, <i32 24, i32 24, i32 24, i32 24>
+  ret <4 x i32> %13
+}