Index: lib/Transforms/InstCombine/InstCombineAddSub.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1047,6 +1047,133 @@
   return nullptr;
 }
 
+// Check if BI matches a ctpop calculation pattern for a value of width BW
+// bits. If so, return the argument V such that ctpop(V) would be a candidate
+// for replacing BI. If BW is less than the bit-width of the type of V, then
+// BI == ctpop(V) if the bits of V beyond BW are zero.
+// The matched pattern is:
+//   x0 := (V & 0x55..5) + ((V>>1) & 0x55..5)
+//   x1 := (x0 & 0x33..3) + ((x0>>2) & 0x33..3)
+//   ...
+//   xn := (xn-1 & 0x00..0FF..F) + ((xn-1>>BW/2) & 0x00..0FF..F)
+// where xn is the candidate for ctpop(V).
+static Value *matchCtpopW(BinaryOperator *BI, unsigned BW) {
+  auto matchStep = [] (Value *V, unsigned S, APInt M, bool ShiftAlone)
+        -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Add(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+
+    auto matchAndShift = [S,M,ShiftAlone] (Value *V0, Value *V1) -> Value* {
+      Value *V = nullptr;
+      const APInt *P = &M;
+      auto Mask = m_APInt(P);
+      auto Shift = m_SpecificInt(S);
+
+      if (!match(V0, m_And(m_Value(V), Mask)))
+        return nullptr;
+      if (ShiftAlone) {
+        if (!match(V1, m_LShr(m_Specific(V), Shift)))
+          return nullptr;
+      } else {
+        if (!match(V1, m_And(m_LShr(m_Specific(V), Shift), Mask)))
+          return nullptr;
+      }
+      return V;
+    };
+
+    if (Value *T = matchAndShift(Op0, Op1))
+      return T;
+    if (Value *T = matchAndShift(Op1, Op0))
+      return T;
+    return nullptr;
+  };
+
+  // Generate the bitmask for the & operation. BW is the bit-width of the
+  // entire mask. The masks are:
+  //   0b01010101..01010101   0x55..55   1 bit every 2 bits
+  //   0b00110011..00110011   0x33..33   2 bits every 4 bits
+  //   0b00000111..00000111   0x07..07   3 bits every 8 bits
+  //   ...                    ...        log2(S) bits every S bits
+  // Normally the masks would be 01010101, 00110011, 00001111, i.e. the
+  // number of contiguous 1 bits in each group would be twice the number
+  // in the previous mask, but by the time this code runs, the "demanded"
+  // bits have been optimized to only require one more 1 bit in each
+  // subsequent mask. This function generates the post-optimized masks.
+  auto getMask = [] (unsigned S, unsigned BW) -> APInt {
+    assert(isPowerOf2_32(S));
+    APInt M(BW, S-1);
+    APInt T(BW, 0);
+    while (M != 0) {
+      T |= M;
+      M <<= S;
+    }
+    return T;
+  };
+
+  Value *V = BI;
+  bool SA = true;
+  unsigned N = BW;
+  while (N > 1) {
+    unsigned S = N/2;
+    V = matchStep(V, S, getMask(N, BW), SA);
+    if (!V)
+      return nullptr;
+    N = S;
+    SA = false;
+  }
+
+  return V;
+}
+
+static Value *optimizeToCtpop(BinaryOperator *BI,
+                              InstCombiner::BuilderTy *Builder) {
+  IntegerType *Ty = dyn_cast<IntegerType>(BI->getType());
+  if (!Ty)
+    return nullptr;
+
+  // Take the first shift amount feeding the add, and assume this is the
+  // last shift in the popcnt computation.
+  Value *Op0 = nullptr, *Op1 = nullptr;
+  if (!match(BI, m_Add(m_Value(Op0), m_Value(Op1))))
+    return nullptr;
+
+  // Shift by half-width.
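+  // For example, a 32-bit popcount that has already been through the
+  // demanded-bits simplification typically ends in
+  //   t = (x & 0xFFFF) + (x >> 16)
+  // so the shift located below would be SH = 16 and the matched width
+  // 2*SH = 32 bits.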
+  uint64_t SH = 0;
+  if (!match(Op0, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op1, m_And(m_Value(), m_LShr(m_Value(), m_ConstantInt(SH)))) &&
+      !match(Op0, m_LShr(m_Value(), m_ConstantInt(SH))) &&
+      !match(Op1, m_LShr(m_Value(), m_ConstantInt(SH))))
+    return nullptr;
+
+  if (SH < 4 || !isPowerOf2_64(SH))
+    return nullptr;
+
+  Value *V = matchCtpopW(BI, 2*SH);
+  if (!V)
+    return nullptr;
+
+  Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+  unsigned TW = Ty->getBitWidth(), BW = 2*SH;
+  if (BW < TW) {
+    // BW is the bit width of the expression whose population count is
+    // being calculated. TW is the bit width of the type associated with
+    // that expression. Usually they are the same, but e.g. for an 8-bit
+    // popcount the type may be "unsigned", i.e. 32-bit, while only the
+    // low 8 bits are counted. In that case BW=8 and TW=32, and the bits
+    // of V beyond BW must be known to be zero.
+    APInt K0(TW, 0), K1(TW, 0);
+    computeKnownBits(V, K0, K1, M->getDataLayout());
+    APInt Need0 = APInt::getBitsSet(TW, BW, TW);
+    if ((K0 & Need0) != Need0)
+      return nullptr;
+  }
+
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, {V->getType()});
+  CallInst *CI = Builder->CreateCall(Func, {V});
+  CI->setDebugLoc(BI->getDebugLoc());
+  return CI;
+}
+
 Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
   bool Changed = SimplifyAssociativeOrCommutative(I);
   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
@@ -1295,6 +1422,9 @@
     }
   }
 
+  if (Value *V = optimizeToCtpop(&I, Builder))
+    return ReplaceInstUsesWith(I, V);
+
   // TODO(jingyue): Consider WillNotOverflowSignedAdd and
   // WillNotOverflowUnsignedAdd to reduce the number of invocations of
   // computeKnownBits.
@@ -1489,9 +1619,71 @@
   return Builder->CreateIntCast(Result, Ty, true);
 }
 
+static Value *optimizeToCtlz(BinaryOperator *BI,
+                             InstCombiner::BuilderTy *Builder) {
+  // Let bw = bitwidth(n),
+  // convert
+  //   n = n | (n>>1)
+  //   n = n | (n>>2)
+  //   n = n | (n>>4)
+  //   ...
+  //   n = n | (n>>bw/2)
+  //   bw - ctpop(n)
+  // to
+  //   ctlz(n).
+  // After the or-shift ladder every bit below the highest set bit of n is
+  // set, so bw - ctpop(smeared n) equals the number of leading zeros of n.
+  // This code expects that the ctpop intrinsic has already been generated.
+
+  uint64_t BW = 0;
+  if (!match(BI, m_Sub(m_ConstantInt(BW), m_Intrinsic<Intrinsic::ctpop>())))
+    return nullptr;
+  // Get the argument of the ctpop.
+  Value *V = cast<IntrinsicInst>(BI->getOperand(1))->getOperand(0);
+
+  // The argument to ctpop may have been zero-extended. Zero extension does
+  // not change the population count, so it is safe to look through the zext.
+  if (auto *Z = dyn_cast<ZExtInst>(V))
+    V = Z->getOperand(0);
+
+  IntegerType *Ty = cast<IntegerType>(V->getType());
+  if (BW < Ty->getBitWidth())
+    return nullptr;
+
+  auto matchOrShift = [] (Value *V, unsigned S) -> Value* {
+    Value *Op0 = nullptr, *Op1 = nullptr;
+    if (!match(V, m_Or(m_Value(Op0), m_Value(Op1))))
+      return nullptr;
+    if (match(Op0, m_LShr(m_Specific(Op1), m_SpecificInt(S))))
+      return Op1;
+    if (match(Op1, m_LShr(m_Specific(Op0), m_SpecificInt(S))))
+      return Op0;
+    return nullptr;
+  };
+
+  unsigned N = BW;
+  while (N > 1) {
+    N /= 2;
+    V = matchOrShift(V, N);
+    if (!V)
+      return nullptr;
+  }
+
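+  // At this point V is the original value n whose leading zeros are being
+  // counted: the loop above has peeled the or-ladder from the outermost
+  // n | (n >> BW/2) down to n | (n >> 1).
+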
+  // The value of BW determines the type of the ctlz argument: if V is
+  // narrower than BW bits, widen it first.
+  if (BW > Ty->getBitWidth()) {
+    IntegerType *ATy = IntegerType::get(BI->getContext(), BW);
+    V = Builder->CreateZExt(V, ATy);
+  }
+  Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, {V->getType()});
+  Value *False = ConstantInt::getFalse(BI->getContext());
+  CallInst *CI = Builder->CreateCall(Func, {V, False});
+  CI->setDebugLoc(BI->getDebugLoc());
+  if (BI->getType() != CI->getType())
+    return Builder->CreateZExt(CI, BI->getType());
+  return CI;
+}
+
 Instruction *InstCombiner::visitSub(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
@@ -1675,6 +1867,9 @@
     if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
       return ReplaceInstUsesWith(I, Res);
 
+  if (Value *V = optimizeToCtlz(&I, Builder))
+    return ReplaceInstUsesWith(I, V);
+
   bool Changed = false;
   if (!I.hasNoSignedWrap() && WillNotOverflowSignedSub(Op0, Op1, I)) {
     Changed = true;
Index: test/Transforms/InstCombine/ctlz-match.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/ctlz-match.ll
@@ -0,0 +1,172 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+; unsigned ctlz16(unsigned short t0) {
+;   t0 = t0 | (t0>>1);
+;   t0 = t0 | (t0>>2);
+;   t0 = t0 | (t0>>4);
+;   t0 = t0 | (t0>>8);
+;   unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 0x5555);
+;   unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333);
+;   unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F);
+;   unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF);
+;   return 16-t4;
+; }
+;
+; CHECK: define i32 @ctlz16
+; CHECK: [[V0:%[a-zA-Z0-9_]+]] = call i16 @llvm.ctlz.i16(i16 %t0, i1 false)
+; CHECK: zext i16 [[V0]] to i32
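+; The popcount in ctlz16 is computed in i32, but the value being counted can
+; only have its low 16 bits set, so the expected output (per the CHECK lines
+; above) is an i16 ctlz of %t0 whose result is zero-extended back to i32.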
+define i32 @ctlz16(i16 zeroext %t0) #0 {
+entry:
+  %conv = zext i16 %t0 to i32
+  %shr = ashr i32 %conv, 1
+  %or = or i32 %conv, %shr
+  %conv2 = trunc i32 %or to i16
+  %conv3 = zext i16 %conv2 to i32
+  %shr5 = ashr i32 %conv3, 2
+  %or6 = or i32 %conv3, %shr5
+  %conv7 = trunc i32 %or6 to i16
+  %conv8 = zext i16 %conv7 to i32
+  %shr10 = ashr i32 %conv8, 4
+  %or11 = or i32 %conv8, %shr10
+  %conv12 = trunc i32 %or11 to i16
+  %conv13 = zext i16 %conv12 to i32
+  %shr15 = ashr i32 %conv13, 8
+  %or16 = or i32 %conv13, %shr15
+  %conv17 = trunc i32 %or16 to i16
+  %conv18 = zext i16 %conv17 to i32
+  %and = and i32 %conv18, 21845
+  %shr20 = ashr i32 %conv18, 1
+  %and21 = and i32 %shr20, 21845
+  %add = add nsw i32 %and, %and21
+  %and22 = and i32 %add, 13107
+  %shr23 = lshr i32 %add, 2
+  %and24 = and i32 %shr23, 13107
+  %add25 = add i32 %and22, %and24
+  %and26 = and i32 %add25, 3855
+  %shr27 = lshr i32 %add25, 4
+  %and28 = and i32 %shr27, 3855
+  %add29 = add i32 %and26, %and28
+  %and30 = and i32 %add29, 255
+  %shr31 = lshr i32 %add29, 8
+  %and32 = and i32 %shr31, 255
+  %add33 = add i32 %and30, %and32
+  %sub = sub i32 16, %add33
+  ret i32 %sub
+}
+
+
+; unsigned ctlz32(unsigned t0) {
+;   t0 = t0 | (t0>>1);
+;   t0 = t0 | (t0>>2);
+;   t0 = t0 | (t0>>4);
+;   t0 = t0 | (t0>>8);
+;   t0 = t0 | (t0>>16);
+;   unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555);
+;   unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333);
+;   unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F);
+;   unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF);
+;   unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF);
+;   return 32-t5;
+; }
+;
+; CHECK: define i32 @ctlz32
+; CHECK: @llvm.ctlz.i32(i32 %t0, i1 false)
+define i32 @ctlz32(i32 %t0) #0 {
+entry:
+  %shr = lshr i32 %t0, 1
+  %or = or i32 %t0, %shr
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %or, %shr1
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %or2, %shr3
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %or4, %shr5
+  %shr7 = lshr i32 %or6, 16
+  %or8 = or i32 %or6, %shr7
+  %and = and i32 %or8, 1431655765
+  %shr9 = lshr i32 %or8, 1
+  %and10 = and i32 %shr9, 1431655765
+  %add = add i32 %and, %and10
+  %and11 = and i32 %add, 858993459
+  %shr12 = lshr i32 %add, 2
+  %and13 = and i32 %shr12, 858993459
+  %add14 = add i32 %and11, %and13
+  %and15 = and i32 %add14, 252645135
+  %shr16 = lshr i32 %add14, 4
+  %and17 = and i32 %shr16, 252645135
+  %add18 = add i32 %and15, %and17
+  %and19 = and i32 %add18, 16711935
+  %shr20 = lshr i32 %add18, 8
+  %and21 = and i32 %shr20, 16711935
+  %add22 = add i32 %and19, %and21
+  %and23 = and i32 %add22, 65535
+  %shr24 = lshr i32 %add22, 16
+  %and25 = and i32 %shr24, 65535
+  %add26 = add i32 %and23, %and25
+  %sub = sub i32 32, %add26
+  ret i32 %sub
+}
+
+
+; typedef unsigned long long u64_t;
+; u64_t ctlz64(u64_t t0) {
+;   t0 = t0 | (t0>>1);
+;   t0 = t0 | (t0>>2);
+;   t0 = t0 | (t0>>4);
+;   t0 = t0 | (t0>>8);
+;   t0 = t0 | (t0>>16);
+;   t0 = t0 | (t0>>32);
+;   u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL);
+;   u64_t t2 = (t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL);
+;   u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL);
+;   u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL);
+;   u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL);
+;   u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL);
+;   return 64-t6;
+; }
+;
+; CHECK: define i64 @ctlz64
+; CHECK: @llvm.ctlz.i64(i64 %t0, i1 false)
+define i64 @ctlz64(i64 %t0) #0 {
+entry:
+  %shr = lshr i64 %t0, 1
+  %or = or i64 %t0, %shr
+  %shr1 = lshr i64 %or, 2
+  %or2 = or i64 %or, %shr1
+  %shr3 = lshr i64 %or2, 4
+  %or4 = or i64 %or2, %shr3
+  %shr5 = lshr i64 %or4, 8
+  %or6 = or i64 %or4, %shr5
+  %shr7 = lshr i64 %or6, 16
+  %or8 = or i64 %or6, %shr7
+  %shr9 = lshr i64 %or8, 32
+  %or10 = or i64 %or8, %shr9
+  %and = and i64 %or10, 6148914691236517205
+  %shr11 = lshr i64 %or10, 1
+  %and12 = and i64 %shr11, 6148914691236517205
+  %add = add i64 %and, %and12
+  %and13 = and i64 %add, 3689348814741910323
+  %shr14 = lshr i64 %add, 2
+  %and15 = and i64 %shr14, 3689348814741910323
+  %add16 = add i64 %and13, %and15
+  %and17 = and i64 %add16, 1085102592571150095
+  %shr18 = lshr i64 %add16, 4
+  %and19 = and i64 %shr18, 1085102592571150095
+  %add20 = add i64 %and17, %and19
+  %and21 = and i64 %add20, 71777214294589695
+  %shr22 = lshr i64 %add20, 8
+  %and23 = and i64 %shr22, 71777214294589695
+  %add24 = add i64 %and21, %and23
+  %and25 = and i64 %add24, 281470681808895
+  %shr26 = lshr i64 %add24, 16
+  %and27 = and i64 %shr26, 281470681808895
+  %add28 = add i64 %and25, %and27
+  %and29 = and i64 %add28, 4294967295
+  %shr30 = lshr i64 %add28, 32
+  %and31 = and i64 %shr30, 4294967295
+  %add32 = add i64 %and29, %and31
+  %sub = sub i64 64, %add32
+  ret i64 %sub
+}
+
+attributes #0 = { nounwind }
Index: test/Transforms/InstCombine/ctpop-match.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/ctpop-match.ll
@@ -0,0 +1,145 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+; unsigned pop8(unsigned char t0) {
+;   unsigned t1 = (t0 & 0x55) + ((t0>>1) & 0x55);
+;   unsigned t2 = (t1 & 0x33) + ((t1>>2) & 0x33);
+;   unsigned t3 = (t2 & 0x0F) + ((t2>>4) & 0x0F);
+;   return t3;
+; }
+;
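+; The count in pop8 is computed in i32 even though only the low 8 bits of the
+; zero-extended argument can be set; the transform is expected to prove this
+; via known bits and then emit @llvm.ctpop.i32 on the extended value, as the
+; CHECK lines below verify.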
+; CHECK: define i32 @pop8
+; CHECK: [[ARG8:%[a-zA-Z0-9_]+]] = zext i8 %t0 to i32
+; CHECK: @llvm.ctpop.i32(i32 [[ARG8]])
+define i32 @pop8(i8 zeroext %t0) #0 {
+entry:
+  %conv = zext i8 %t0 to i32
+  %and = and i32 %conv, 85
+  %shr = ashr i32 %conv, 1
+  %and2 = and i32 %shr, 85
+  %add = add nsw i32 %and, %and2
+  %and3 = and i32 %add, 51
+  %shr4 = lshr i32 %add, 2
+  %and5 = and i32 %shr4, 51
+  %add6 = add i32 %and3, %and5
+  %and7 = and i32 %add6, 15
+  %shr8 = lshr i32 %add6, 4
+  %and9 = and i32 %shr8, 15
+  %add10 = add i32 %and7, %and9
+  ret i32 %add10
+}
+
+
+; unsigned pop16(unsigned short t0) {
+;   unsigned t1 = (t0 & 0x5555) + ((t0>>1) & 0x5555);
+;   unsigned t2 = (t1 & 0x3333) + ((t1>>2) & 0x3333);
+;   unsigned t3 = (t2 & 0x0F0F) + ((t2>>4) & 0x0F0F);
+;   unsigned t4 = (t3 & 0x00FF) + ((t3>>8) & 0x00FF);
+;   return t4;
+; }
+;
+; CHECK: define i32 @pop16
+; CHECK: [[ARG16:%[a-zA-Z0-9_]+]] = zext i16 %t0 to i32
+; CHECK: @llvm.ctpop.i32(i32 [[ARG16]])
+define i32 @pop16(i16 zeroext %t0) #0 {
+entry:
+  %conv = zext i16 %t0 to i32
+  %and = and i32 %conv, 21845
+  %shr = ashr i32 %conv, 1
+  %and2 = and i32 %shr, 21845
+  %add = add nsw i32 %and, %and2
+  %and3 = and i32 %add, 13107
+  %shr4 = lshr i32 %add, 2
+  %and5 = and i32 %shr4, 13107
+  %add6 = add i32 %and3, %and5
+  %and7 = and i32 %add6, 3855
+  %shr8 = lshr i32 %add6, 4
+  %and9 = and i32 %shr8, 3855
+  %add10 = add i32 %and7, %and9
+  %and11 = and i32 %add10, 255
+  %shr12 = lshr i32 %add10, 8
+  %and13 = and i32 %shr12, 255
+  %add14 = add i32 %and11, %and13
+  ret i32 %add14
+}
+
+
+; unsigned pop32(unsigned t0) {
+;   unsigned t1 = (t0 & 0x55555555) + ((t0>>1) & 0x55555555);
+;   unsigned t2 = (t1 & 0x33333333) + ((t1>>2) & 0x33333333);
+;   unsigned t3 = (t2 & 0x0F0F0F0F) + ((t2>>4) & 0x0F0F0F0F);
+;   unsigned t4 = (t3 & 0x00FF00FF) + ((t3>>8) & 0x00FF00FF);
+;   unsigned t5 = (t4 & 0x0000FFFF) + ((t4>>16) & 0x0000FFFF);
+;   return t5;
+; }
+;
+; CHECK: define i32 @pop32
+; CHECK: @llvm.ctpop.i32(i32 %t0)
+define i32 @pop32(i32 %t0) #0 {
+entry:
+  %and = and i32 %t0, 1431655765
+  %shr = lshr i32 %t0, 1
+  %and1 = and i32 %shr, 1431655765
+  %add = add i32 %and, %and1
+  %and2 = and i32 %add, 858993459
+  %shr3 = lshr i32 %add, 2
+  %and4 = and i32 %shr3, 858993459
+  %add5 = add i32 %and2, %and4
+  %and6 = and i32 %add5, 252645135
+  %shr7 = lshr i32 %add5, 4
+  %and8 = and i32 %shr7, 252645135
+  %add9 = add i32 %and6, %and8
+  %and10 = and i32 %add9, 16711935
+  %shr11 = lshr i32 %add9, 8
+  %and12 = and i32 %shr11, 16711935
+  %add13 = add i32 %and10, %and12
+  %and14 = and i32 %add13, 65535
+  %shr15 = lshr i32 %add13, 16
+  %and16 = and i32 %shr15, 65535
+  %add17 = add i32 %and14, %and16
+  ret i32 %add17
+}
+
+
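+; In pop32 above and pop64 below the count is computed in the full width of
+; the input, so no check of the upper bits is needed and the intrinsic is
+; expected to be created directly on %t0.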
+; typedef unsigned long long u64_t;
+; u64_t pop64(u64_t t0) {
+;   u64_t t1 = (t0 & 0x5555555555555555LL) + ((t0>>1) & 0x5555555555555555LL);
+;   u64_t t2 = (t1 & 0x3333333333333333LL) + ((t1>>2) & 0x3333333333333333LL);
+;   u64_t t3 = (t2 & 0x0F0F0F0F0F0F0F0FLL) + ((t2>>4) & 0x0F0F0F0F0F0F0F0FLL);
+;   u64_t t4 = (t3 & 0x00FF00FF00FF00FFLL) + ((t3>>8) & 0x00FF00FF00FF00FFLL);
+;   u64_t t5 = (t4 & 0x0000FFFF0000FFFFLL) + ((t4>>16) & 0x0000FFFF0000FFFFLL);
+;   u64_t t6 = (t5 & 0x00000000FFFFFFFFLL) + ((t5>>32) & 0x00000000FFFFFFFFLL);
+;   return t6;
+; }
+;
+; CHECK: define i64 @pop64
+; CHECK: @llvm.ctpop.i64(i64 %t0)
+define i64 @pop64(i64 %t0) #0 {
+entry:
+  %and = and i64 %t0, 6148914691236517205
+  %shr = lshr i64 %t0, 1
+  %and1 = and i64 %shr, 6148914691236517205
+  %add = add i64 %and, %and1
+  %and2 = and i64 %add, 3689348814741910323
+  %shr3 = lshr i64 %add, 2
+  %and4 = and i64 %shr3, 3689348814741910323
+  %add5 = add i64 %and2, %and4
+  %and6 = and i64 %add5, 1085102592571150095
+  %shr7 = lshr i64 %add5, 4
+  %and8 = and i64 %shr7, 1085102592571150095
+  %add9 = add i64 %and6, %and8
+  %and10 = and i64 %add9, 71777214294589695
+  %shr11 = lshr i64 %add9, 8
+  %and12 = and i64 %shr11, 71777214294589695
+  %add13 = add i64 %and10, %and12
+  %and14 = and i64 %add13, 281470681808895
+  %shr15 = lshr i64 %add13, 16
+  %and16 = and i64 %shr15, 281470681808895
+  %add17 = add i64 %and14, %and16
+  %and18 = and i64 %add17, 4294967295
+  %shr19 = lshr i64 %add17, 32
+  %and20 = and i64 %shr19, 4294967295
+  %add21 = add i64 %and18, %and20
+  ret i64 %add21
+}
+
+attributes #0 = { nounwind }