diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1322,19 +1322,38 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) { assert(II.getIntrinsicID() == Intrinsic::ctpop && "Expected ctpop intrinsic"); + Type *Ty = II.getType(); + unsigned BitWidth = Ty->getScalarSizeInBits(); Value *Op0 = II.getArgOperand(0); Value *X; + // ctpop(bitreverse(x)) -> ctpop(x) // ctpop(bswap(x)) -> ctpop(x) if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X)))) return IC.replaceOperand(II, 0, X); + // ctpop(x | -x) -> bitwidth - cttz(x, false) + if (match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) { + Function *F = + Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty); + auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()}); + auto *Bw = Constant::getIntegerValue(Ty, APInt(BitWidth, BitWidth)); + return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz)); + } + + // ctpop(~x & (x - 1)) -> cttz(x, false) + if (match(Op0, + m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) { + Function *F = + Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty); + return CallInst::Create(F, {X, IC.Builder.getFalse()}); + } + // FIXME: Try to simplify vectors of integers. - auto *IT = dyn_cast(Op0->getType()); + auto *IT = dyn_cast(Ty); if (!IT) return nullptr; - unsigned BitWidth = IT->getBitWidth(); KnownBits Known(BitWidth); IC.computeKnownBits(Op0, Known, 0, &II); diff --git a/llvm/test/Transforms/InstCombine/ctpop-cttz.ll b/llvm/test/Transforms/InstCombine/ctpop-cttz.ll --- a/llvm/test/Transforms/InstCombine/ctpop-cttz.ll +++ b/llvm/test/Transforms/InstCombine/ctpop-cttz.ll @@ -8,14 +8,11 @@ ; __builtin_popcount(i | -i) -> 32 - __builtin_cttz(i, false) define i32 @ctpop1(i32 %0) { ; CHECK-LABEL: @ctpop1( -; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP0:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[TMP3]]), !range !0 -; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw i32 32, [[TMP4]] -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP0:%.*]], i1 false), !range !0 +; CHECK-NEXT: ret i32 [[TMP2]] ; %2 = sub i32 0, %0 - %3 = or i32 %2, %0 + %3 = or i32 %0, %2 %4 = tail call i32 @llvm.ctpop.i32(i32 %3) %5 = sub i32 32, %4 ret i32 %5 @@ -23,31 +20,25 @@ define <2 x i32> @ctpop1v(<2 x i32> %0) { ; CHECK-LABEL: @ctpop1v( -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> zeroinitializer, [[TMP0:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> , [[TMP4]] -; CHECK-NEXT: ret <2 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[TMP0:%.*]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> , [[TMP2]] +; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %2 = sub <2 x i32> zeroinitializer, %0 %3 = or <2 x i32> %2, %0 %4 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %3) - %5 = sub <2 x i32> , %4 - ret <2 x i32> %5 + ret <2 x i32> %4 } ; PR43513 ; __builtin_popcount(~i & (i-1)) -> __builtin_cttz(i, false) define i32 @ctpop2(i32 %0) { ; CHECK-LABEL: @ctpop2( -; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP0:%.*]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], -1 -; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[TMP4]]), !range !0 -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP0:%.*]], i1 false), !range !0 +; CHECK-NEXT: ret i32 [[TMP2]] ; %2 = xor i32 %0, -1 - %3 = add i32 %0, -1 + %3 = sub i32 %0, 1 %4 = and i32 %3, %2 %5 = tail call i32 @llvm.ctpop.i32(i32 %4) ret i32 %5 @@ -55,15 +46,12 @@ define <2 x i32> @ctpop2v(<2 x i32> %0) { ; CHECK-LABEL: @ctpop2v( -; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[TMP0:%.*]], -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i32> [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[TMP4]]) -; CHECK-NEXT: ret <2 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[TMP0:%.*]], i1 false) +; CHECK-NEXT: ret <2 x i32> [[TMP2]] ; %2 = xor <2 x i32> %0, %3 = add <2 x i32> %0, - %4 = and <2 x i32> %3, %2 + %4 = and <2 x i32> %2, %3 %5 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %4) ret <2 x i32> %5 }