Index: include/llvm/Transforms/Utils/Local.h
===================================================================
--- include/llvm/Transforms/Utils/Local.h
+++ include/llvm/Transforms/Utils/Local.h
@@ -172,6 +172,7 @@
 bool SimplifyInstructionsInBlock(BasicBlock *BB,
                                  const TargetLibraryInfo *TLI = nullptr);
+bool EliminateRedundantMasks(BasicBlock &BB);
 
 //===----------------------------------------------------------------------===//
 // Control Flow Graph Restructuring.
 //
Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -165,6 +165,7 @@
 static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
   bool MadeChange = false;
   for (BasicBlock &BB : F) {
+    MadeChange |= EliminateRedundantMasks(BB);
     // Ignore unreachable basic blocks.
     if (!DT.isReachableFromEntry(&BB))
       continue;
Index: lib/Transforms/Utils/Local.cpp
===================================================================
--- lib/Transforms/Utils/Local.cpp
+++ lib/Transforms/Utils/Local.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/TinyPtrVector.h"
@@ -611,6 +612,228 @@
   return MadeChange;
 }
 
+static cl::opt<unsigned> EliminateRedundantMasksSearchLimit(
+    "erm-search-limit",
+    cl::desc("Maximum number of cached masks to compare against."),
+    cl::init(6), cl::Hidden);
+
+/// Eliminate redundant masks.
+/// Let the pattern be:
+///   X1 = and(X, c1)
+///   X2 = and(shift(X, c3), c2)
+/// where c1, c2, c3 are constants.
+/// We try to infer whether we can reuse X1 to compute X2,
+/// obtaining X2 = shift(X1, c3).
+bool llvm::EliminateRedundantMasks(BasicBlock &BB) {
+  struct APIntLT {
+    bool operator()(APInt a, APInt b) { return a.ult(b); }
+  };
+
+  typedef std::map<APInt, BinaryOperator *, APIntLT> ANDS;
+
+  bool HasChanges = false;
+  // Cache, for every masked value, the bits that we don't know to be zero.
+  std::map<Value *, APInt> ValuesNonZeroCache;
+  // A given mask should be extracted from a value only once. Cache the
+  // ANDs so we can detect duplicates.
+  std::map<Value *, ANDS> Ands;
+  // Duplicated masking operations are removed in the 2nd stage.
+  SmallSet<Instruction *, 8> ToRemove;
+
+  // 1st stage: Cache all (and X, c1) values, using key (X, "effective c1").
+  // If we detect duplicated AND operations, erase those that come after.
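+  // For example (illustrative values only), given
+  //   %a1 = and i32 %x, 65280
+  //   %a2 = and i32 %x, 65280
+  // the second AND is a duplicate: its uses are rewritten to %a1 and the
+  // instruction is queued in ToRemove.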
+  for (Instruction &I : BB) {
+    const Type *T = I.getType();
+    // At the moment we only handle scalar integer types.
+    if (!(T->isIntegerTy() && T->isSized()) || T->isVectorTy())
+      continue;
+
+    const APInt *MaskR;
+    BinaryOperator *AndOp;
+    Value *X;
+
+    if (!match(&I,
+               m_CombineAnd(m_BinOp(AndOp), m_And(m_Value(X), m_APInt(MaskR)))))
+      continue;
+
+    APInt Mask = *MaskR;
+    if (ValuesNonZeroCache.find(X) == ValuesNonZeroCache.end()) {
+      KnownBits KB =
+          computeKnownBits(X, BB.getParent()->getParent()->getDataLayout());
+      ValuesNonZeroCache[X] = ~KB.Zero;
+    }
+    // Reduce the mask to the bits that may actually be set in X so that
+    // equivalent masks share the same key.
+    Mask &= ValuesNonZeroCache[X];
+
+    if (BinaryOperator *ReferenceAnd = Ands[X][Mask]) {
+      // Replace all uses of this AND with the already cached one.
+      LLVM_DEBUG(dbgs() << "Replacing: "; AndOp->dump(); dbgs() << " With: ";
+                 ReferenceAnd->dump());
+      AndOp->replaceAllUsesWith(ReferenceAnd);
+      ToRemove.insert(AndOp);
+      HasChanges = true;
+      continue;
+    }
+
+    Ands[X][Mask] = AndOp;
+    LLVM_DEBUG(dbgs() << "Value: "; X->dump(); dbgs() << "\t is masked by ";
+               AndOp->dump();
+               dbgs() << "\tThe mask: 0x" << Mask.toString(16, false) << '\n');
+  }
+
+  // 2nd stage: Remove the dead masking operations.
+  for (Instruction *I : ToRemove) {
+    I->removeFromParent();
+    I->deleteValue();
+  }
+
+  ToRemove.clear();
+
+  // 3rd stage: Walk backwards looking for masks applied to shifted values that
+  // extract the same bits as an already cached mask, and replace the shift's
+  // operand with the existing masked value.
+
+  // Once we decide to reuse a given value, we must ensure all (and) ops
+  // dominate their uses. ToExecuteBefore holds the first user for an (and).
+  std::map<Instruction *, Instruction *> ToExecuteBefore;
+
+  for (auto II = BB.rbegin(); II != BB.rend(); II++) {
+    Instruction *I = &*II;
+    const Type *T = I->getType();
+    if (!(T->isIntegerTy() && T->isSized()) || T->isVectorTy())
+      continue;
+
+    ConstantInt *ShiftAmtC;
+    const APInt *SMask;
+    BinaryOperator *Shift, *DeadAnd;
+    Value *X;
+
+    if (!match(I, m_CombineAnd(
+                      m_BinOp(DeadAnd),
+                      m_And(m_CombineAnd(
+                                m_BinOp(Shift),
+                                m_OneUse(m_Shift(m_Value(X),
+                                                 m_ConstantInt(ShiftAmtC)))),
+                            m_APInt(SMask)))))
+      continue;
+    LLVM_DEBUG(dbgs() << "Matched: "; DeadAnd->dump(); Shift->dump(););
+
+    auto ValueAndsI = Ands.find(X);
+    if (ValueAndsI == Ands.end())
+      continue;
+
+    const APInt *ShiftAmt = &ShiftAmtC->getValue();
+    const unsigned VShiftAmt = ShiftAmt->getZExtValue();
+    const bool SafeAShr = SMask->countLeadingOnes() < VShiftAmt;
+    const bool AShrToLShr = ValuesNonZeroCache[X].isSignBitClear() ||
+                            SMask->countLeadingZeros() >= VShiftAmt;
+    const auto Opcode = Shift->getOpcode();
+    // Try to find a direct match of the masked value.
+    BinaryOperator *RAnd = nullptr;
+    if (Opcode == Instruction::Shl) {
+      APInt EffectiveMask = ValuesNonZeroCache[X] & SMask->lshr(VShiftAmt);
+      LLVM_DEBUG(dbgs() << "\tThe mask: 0x" << EffectiveMask.toString(16, false)
+                        << '\n');
+
+      RAnd = ValueAndsI->second[EffectiveMask];
+    } else if (Opcode == Instruction::LShr || SafeAShr || AShrToLShr) {
+      APInt EffectiveMask = ValuesNonZeroCache[X] & SMask->shl(VShiftAmt);
+      LLVM_DEBUG(dbgs() << "\tThe mask: 0x" << EffectiveMask.toString(16, false)
+                        << "\nNon Zero mask: 0x"
+                        << ValuesNonZeroCache[X].toString(16, false) << '\n');
+
+      RAnd = ValueAndsI->second[EffectiveMask];
+    }
+
+    // If we can't find a direct match, explore the existing masks to see if
+    // any of them covers the required bits. The search is limited to
+    // erm-search-limit (default 6) masks per value.
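+    // For example (illustrative values only), for
+    //   %dead = and i32 (lshr i32 %x, 8), 172
+    // a cached (and i32 %x, 44032) qualifies, since 0xAC00 lshr 8 == 0xAC.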
+    if (!RAnd) {
+      unsigned C = EliminateRedundantMasksSearchLimit;
+      for (auto ToTest = ValueAndsI->second.begin(),
+                End = ValueAndsI->second.end();
+           ToTest != End && C != 0; C--, ToTest++) {
+        if (Shift->getOpcode() == Instruction::Shl) {
+          if (ToTest->first.shl(VShiftAmt) == *SMask) {
+            RAnd = ToTest->second;
+            break;
+          }
+        } else if (Shift->getOpcode() == Instruction::LShr) {
+          if (ToTest->first.lshr(VShiftAmt) == *SMask) {
+            RAnd = ToTest->second;
+            break;
+          }
+        }
+        // Instruction::AShr
+        else if (ToTest->first.ashr(VShiftAmt) == *SMask ||
+                 (AShrToLShr && ToTest->first.lshr(VShiftAmt) == *SMask)) {
+          RAnd = ToTest->second;
+          break;
+        }
+      }
+    }
+
+    if (!RAnd)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Reusing result of: "; RAnd->dump();
+               dbgs() << " To compute: "; Shift->dump();
+               dbgs() << " Eliminating: "; DeadAnd->dump());
+    HasChanges = true;
+    if (Opcode == Instruction::AShr && AShrToLShr) {
+      BinaryOperator *newShift = BinaryOperator::CreateLShr(RAnd, ShiftAmtC);
+      newShift->insertBefore(Shift);
+      Shift->replaceAllUsesWith(newShift);
+      Shift->removeFromParent();
+      Shift->deleteValue();
+      Shift = newShift;
+    } else
+      Shift->setOperand(0, RAnd);
+#ifndef NDEBUG
+    KnownBits Before =
+        computeKnownBits(DeadAnd, BB.getParent()->getParent()->getDataLayout());
+    KnownBits After =
+        computeKnownBits(Shift, BB.getParent()->getParent()->getDataLayout());
+    (void)Before;
+    (void)After;
+    LLVM_DEBUG(dbgs() << "";
+
+               if (!(Before.Zero == After.Zero && Before.One == After.One)) {
+                 dbgs() << "Before zero: 0x" << Before.Zero.toString(16, false)
+                        << "\nAfter zero: 0x" << After.Zero.toString(16, false)
+                        << "\nBefore one: 0x" << Before.One.toString(16, false)
+                        << "\nAfter one: 0x" << After.One.toString(16, false);
+                 BB.dump();
+                 errs() << "This transformation is invalid!";
+               });
+    assert(Before.Zero == After.Zero && Before.One == After.One);
+#endif
+    DeadAnd->replaceAllUsesWith(Shift);
+    ValuesNonZeroCache.erase(DeadAnd);
+    Ands.erase(DeadAnd);
+    ToRemove.insert(DeadAnd);
+    ToExecuteBefore[RAnd] = Shift;
+  }
+
+  for (Instruction *I : ToRemove) {
+    I->removeFromParent();
+    I->deleteValue();
+  }
+
+  ToRemove.clear();
+
+  auto End = BB.end();
+  for (auto &ProducerUser : ToExecuteBefore) {
+    auto Producer = ProducerUser.first->getIterator();
+    auto User = ProducerUser.second->getIterator();
+    while (++Producer != End && Producer != User)
+      ;
+    if (Producer == End)
+      ProducerUser.first->moveBefore(ProducerUser.second);
+  }
+  ToExecuteBefore.clear();
+  return HasChanges;
+}
+
 //===----------------------------------------------------------------------===//
 // Control Flow Graph Restructuring.
 //
Index: test/Transforms/AggressiveInstCombine/EliminateRedundantMasks.ll
===================================================================
--- /dev/null
+++ test/Transforms/AggressiveInstCombine/EliminateRedundantMasks.ll
@@ -0,0 +1,244 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -aggressive-instcombine %s -o - | FileCheck %s
+
+; Fold redundant masking operations of a shifted value.
+; In a case where
+;   x1 = a & 0xFF00
+;   x2 = a >> 8 & 0xFF
+; we can see x2 as:
+;   x2 = a >> 8 & 0xFF00 >> 8
+; which can be rewritten as
+;   x2 = (a & 0xFF00) >> 8
+; that is
+;   x2 = x1 >> 8
+
+
+define i32 @shl(i16 %a) {
+; CHECK-LABEL: @shl(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SXT:%.*]] = sext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SXT]], 172
+; CHECK-NEXT:    [[SH:%.*]] = shl i32 [[AND]], 8
+; CHECK-NEXT:    [[OUT:%.*]] = or i32 [[AND]], [[SH]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+entry:
+  %sxt = sext i16 %a to i32
+  %sh = shl i32 %sxt, 8
+  %and = and i32 %sxt, 172
+  %dead = and i32 %sh, 44032
+  %out = or i32 %and, %dead
+  ret i32 %out
+}
+
+define i32 @lshr(i16 %a) {
+; CHECK-LABEL: @lshr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SXT:%.*]] = sext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SXT]], 44032
+; CHECK-NEXT:    [[SH:%.*]] = lshr i32 [[AND]], 8
+; CHECK-NEXT:    [[OUT:%.*]] = or i32 [[SH]], [[AND]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+entry:
+  %sxt = sext i16 %a to i32
+  %sh = lshr i32 %sxt, 8
+  %dead = and i32 %sh, 172
+  %and = and i32 %sxt, 44032
+  %out = or i32 %dead, %and
+  ret i32 %out
+}
+
+define i32 @ashr(i16 %a) {
+; CHECK-LABEL: @ashr(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[EXT:%.*]] = sext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[EXT]], 44032
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i32 [[AND]], 8
+; CHECK-NEXT:    [[OUT:%.*]] = or i32 [[AND]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+entry:
+  %ext = sext i16 %a to i32
+  %and = and i32 %ext, 44032
+  %sh = ashr i32 %ext, 8
+  %dead = and i32 %sh, 172
+  %out = or i32 %and, %dead
+  ret i32 %out
+}
+
+; Check that when the sign bit may be one, we can't ignore the upper bits
+define i32 @ashr_no_good(i16 %a) {
+; CHECK-LABEL: @ashr_no_good(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[EXT:%.*]] = sext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    [[SH:%.*]] = ashr i32 [[EXT]], 8
+; CHECK-NEXT:    [[NOTDEAD:%.*]] = and i32 [[SH]], 2113895935
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[EXT]], 32255
+; CHECK-NEXT:    [[OUT:%.*]] = or i32 [[AND]], [[NOTDEAD]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+entry:
+  %ext = sext i16 %a to i32
+  %sh = ashr i32 %ext, 8
+  %notdead = and i32 %sh, 2113895935 ; 0x7DFF7DFF
+  %and = and i32 %ext, 32255 ; 0x7DFF
+  %out = or i32 %and, %notdead
+  ret i32 %out
+}
+
+define i32 @ashr2(i16 %a) {
+; CHECK-LABEL: @ashr2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[EXT:%.*]] = sext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[EXT]], 65280
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i32 [[AND]], 8
+; CHECK-NEXT:    [[OUT:%.*]] = or i32 [[AND]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+entry:
+  %ext = sext i16 %a to i32
+  %and = and i32 %ext, 65280
+  %sh = ashr i32 %ext, 8
+  %dead = and i32 %sh, 255
+  %out = or i32 %and, %dead
+  ret i32 %out
+}
+
+define i16 @ashr3(i16 %a) {
+; CHECK-LABEL: @ashr3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i16 [[A:%.*]], -256
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[AND]], 8
+; CHECK-NEXT:    [[OUT:%.*]] = or i16 [[AND]], [[TMP0]]
+; CHECK-NEXT:    ret i16 [[OUT]]
+;
+entry:
+  %and = and i16 %a, 65280
+  %sh = ashr i16 %a, 8
+  %dead = and i16 %sh, 255
+  %out = or i16 %and, %dead
+  ret i16 %out
+}
+
+define i32 @ashr_nogood(i16 %a) {
+; CHECK-LABEL: @ashr_nogood(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[EXT:%.*]] = sext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[EXT]], 65280
+; CHECK-NEXT:    [[SH:%.*]] = ashr i32 [[EXT]], 8
+; CHECK-NEXT:    [[NOTDEAD:%.*]] = and i32 [[SH]], 65535
+; CHECK-NEXT:    [[OUT:%.*]] = or i32 [[AND]], [[NOTDEAD]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+entry:
+  %ext = sext i16 %a to i32
+  %and = and i32 %ext, 65280
+  %sh = ashr i32 %ext, 8
+  %notdead = and i32 %sh, 65535
+  %out = or i32 %and, %notdead
+  ret i32 %out
+}
+
+define i32 @shl_nogood(i16 %a) {
+; CHECK-LABEL: @shl_nogood(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[EXT:%.*]] = sext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[EXT]], 172
+; CHECK-NEXT:    [[SH:%.*]] = shl i32 [[EXT]], [[AND]]
+; CHECK-NEXT:    [[NOTDEAD:%.*]] = and i32 [[SH]], 44032
+; CHECK-NEXT:    [[OUT:%.*]] = or i32 [[AND]], [[NOTDEAD]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+entry:
+  %ext = sext i16 %a to i32
+  %and = and i32 %ext, 172
+  %sh = shl i32 %ext, %and
+  %notdead = and i32 %sh, 44032
+  %out = or i32 %and, %notdead
+  ret i32 %out
+}
+
+define void @foo(i16* %a) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LD:%.*]] = load i16, i16* [[A:%.*]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[LD]] to i32
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[CONV]], 65280
+; CHECK-NEXT:    [[SH:%.*]] = lshr i32 [[AND]], 8
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SH]], [[AND]]
+; CHECK-NEXT:    [[CONV4:%.*]] = trunc i32 [[OR]] to i16
+; CHECK-NEXT:    store i16 [[CONV4]], i16* [[A]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ld = load i16, i16* %a, align 2
+  %conv = sext i16 %ld to i32
+  %and = and i32 %conv, 65280
+  %sh = lshr i32 %conv, 8
+  %and3 = and i32 %sh, 255
+  %or = or i32 %and3, %and
+  %conv4 = trunc i32 %or to i16
+  store i16 %conv4, i16* %a, align 2
+  ret void
+}
+
+; Check that all uses of %dead are replaced
+define i32 @mulUses(i32 %a) {
+; CHECK-LABEL: @mulUses(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[A:%.*]], 44032
+; CHECK-NEXT:    [[SH:%.*]] = lshr i32 [[AND]], 8
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SH]], [[AND]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[OR]], [[SH]]
+; CHECK-NEXT:    [[OUT:%.*]] = mul i32 [[ADD]], [[SH]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+entry:
+  %sh = lshr i32 %a, 8
+  %dead = and i32 %sh, 172
+  %and = and i32 %a, 44032
+  %or = or i32 %dead, %and
+  %add = add i32 %or, %dead
+  %out = mul i32 %add, %dead
+  ret i32 %out
+}
+
+; If the shift operation is used elsewhere, it must live
+define i32 @mulUsesShift_no_good(i32 %a) {
+; CHECK-LABEL: @mulUsesShift_no_good(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SH:%.*]] = lshr i32 [[A:%.*]], 8
+; CHECK-NEXT:    [[NOTDEAD:%.*]] = and i32 [[SH]], 172
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[A]], 44032
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[NOTDEAD]], [[AND]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[OR]], [[NOTDEAD]]
+; CHECK-NEXT:    [[OUT:%.*]] = mul i32 [[ADD]], [[SH]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+entry:
+  %sh = lshr i32 %a, 8
+  %notdead = and i32 %sh, 172
+  %and = and i32 %a, 44032
+  %or = or i32 %notdead, %and
+  %add = add i32 %or, %notdead
+  %out = mul i32 %add, %sh
+  ret i32 %out
+}
+
+define void @shift_r1(i32 %x) {
+; CHECK-LABEL: @shift_r1(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], 172
+; CHECK-NEXT:    [[SH:%.*]] = shl i32 [[AND]], 8
+; CHECK-NEXT:    [[CL1:%.*]] = call i32 @mulUses(i32 [[SH]])
+; CHECK-NEXT:    [[CL2:%.*]] = call i32 @mulUses(i32 [[AND]])
+; CHECK-NEXT:    ret void
+;
+  %sh = shl i32 %x, 8
+  %dead = and i32 %sh, 44032
+  %cl1 = call i32 @mulUses(i32 %dead)
+  %and = and i32 %x, 172
+  %cl2 = call i32 @mulUses(i32 %and)
+  ret void
+}