Index: include/llvm/Analysis/MemorySSAUpdater.h =================================================================== --- include/llvm/Analysis/MemorySSAUpdater.h +++ include/llvm/Analysis/MemorySSAUpdater.h @@ -134,6 +134,9 @@ /// load is simply erased (not replaced), removeMemoryAccess should be called /// on the MemoryAccess for that store/load. void removeMemoryAccess(MemoryAccess *); + /// If the instruction has a MemoryAccess associated with it, remove + /// the MemoryAccess from MemorySSA. + void removeMemoryAccess(Instruction *); private: // Move What before Where in the MemorySSA IR. Index: include/llvm/CodeGen/Passes.h =================================================================== --- include/llvm/CodeGen/Passes.h +++ include/llvm/CodeGen/Passes.h @@ -64,6 +64,9 @@ MachineFunctionPass *createResetMachineFunctionPass(bool EmitFallbackDiag, bool AbortOnFailedISel); + /// This pass shrinks some load/store to narrower legal type accesses. + FunctionPass *createMemAccessShrinkingPass(const TargetMachine *TM = nullptr); + /// createCodeGenPreparePass - Transform the code to expose more pattern /// matching during instruction selection. FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = nullptr); Index: include/llvm/IR/PatternMatch.h =================================================================== --- include/llvm/IR/PatternMatch.h +++ include/llvm/IR/PatternMatch.h @@ -317,6 +317,9 @@ /// \brief Match a ConstantInt, capturing the value if we match. inline bind_ty m_ConstantInt(ConstantInt *&CI) { return CI; } +/// \brief Match a load instruction, capturing the value if we match. +inline bind_ty m_Load(LoadInst *&LI) { return LI; } + /// \brief Match a Constant, capturing the value if we match. inline bind_ty m_Constant(Constant *&C) { return C; } Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -246,6 +246,7 @@ void initializeMachineSinkingPass(PassRegistry&); void initializeMachineTraceMetricsPass(PassRegistry&); void initializeMachineVerifierPassPass(PassRegistry&); +void initializeMemAccessShrinkingPassPass(PassRegistry &); void initializeMemCpyOptLegacyPassPass(PassRegistry&); void initializeMemDepPrinterPass(PassRegistry&); void initializeMemDerefPrinterPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -168,6 +168,7 @@ (void) llvm::createCountingFunctionInserterPass(); (void) llvm::createEarlyCSEPass(); (void) llvm::createGVNHoistPass(); + (void) llvm::createMemAccessShrinkingPass(); (void) llvm::createMergedLoadStoreMotionPass(); (void) llvm::createGVNPass(); (void) llvm::createNewGVNPass(); Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -1965,6 +1965,14 @@ return false; } + /// Return true if it's expensive to narrow operations of type VT1 to + /// VT2. Sometimes we want to try load/store shrinking to save extra + /// bit manipulation operations. We need to make sure the shrinking will + /// not be so expensive to offset the benefit. + virtual bool isNarrowingExpensive(EVT /*VT1*/, EVT /*VT2*/) const { + return false; + } + /// \brief Return true if it is beneficial to convert a load of a constant to /// just the constant itself. 
/// On some targets it might be more efficient to use a combination of Index: include/llvm/Transforms/Utils/Local.h =================================================================== --- include/llvm/Transforms/Utils/Local.h +++ include/llvm/Transforms/Utils/Local.h @@ -15,13 +15,14 @@ #ifndef LLVM_TRANSFORMS_UTILS_LOCAL_H #define LLVM_TRANSFORMS_UTILS_LOCAL_H +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Operator.h" -#include "llvm/ADT/SmallPtrSet.h" namespace llvm { @@ -81,8 +82,9 @@ /// If the specified value is a trivially dead instruction, delete it. /// If that makes any of its operands trivially dead, delete them too, /// recursively. Return true if any instructions were deleted. -bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, - const TargetLibraryInfo *TLI = nullptr); +bool RecursivelyDeleteTriviallyDeadInstructions( + Value *V, const TargetLibraryInfo *TLI = nullptr, + MemorySSAUpdater *MSSAUpdater = nullptr); /// If the specified value is an effectively dead PHI node, due to being a /// def-use chain of single-use nodes that either forms a cycle or is terminated Index: lib/Analysis/MemorySSAUpdater.cpp =================================================================== --- lib/Analysis/MemorySSAUpdater.cpp +++ lib/Analysis/MemorySSAUpdater.cpp @@ -464,6 +464,12 @@ MSSA->removeFromLists(MA); } +void MemorySSAUpdater::removeMemoryAccess(Instruction *I) { + MemoryAccess *MemAcc = MSSA->getMemoryAccess(I); + if (MemAcc) + removeMemoryAccess(MemAcc); +} + MemoryAccess *MemorySSAUpdater::createMemoryAccessInBB( Instruction *I, MemoryAccess *Definition, const BasicBlock *BB, MemorySSA::InsertionPlace Point) { Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -86,6 +86,7 @@ MachineSSAUpdater.cpp MachineTraceMetrics.cpp MachineVerifier.cpp + MemAccessShrinking.cpp PatchableFunction.cpp MIRPrinter.cpp MIRPrintingPass.cpp Index: lib/CodeGen/CodeGen.cpp =================================================================== --- lib/CodeGen/CodeGen.cpp +++ lib/CodeGen/CodeGen.cpp @@ -66,6 +66,7 @@ initializeMachineSchedulerPass(Registry); initializeMachineSinkingPass(Registry); initializeMachineVerifierPassPass(Registry); + initializeMemAccessShrinkingPassPass(Registry); initializeOptimizePHIsPass(Registry); initializePEIPass(Registry); initializePHIEliminationPass(Registry); Index: lib/CodeGen/MemAccessShrinking.cpp =================================================================== --- lib/CodeGen/MemAccessShrinking.cpp +++ lib/CodeGen/MemAccessShrinking.cpp @@ -0,0 +1,1189 @@ +//===- MemAccessShrinking.cpp - Shrink the type of mem accesses -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// The pass tries to find opportunities where we can save bit manipulation +/// operations or convert illegal type operations to legal type by shrinking +/// the type of memory accesses. +/// +/// A major motivation of the optimization: +/// Consecutive bitfields are wrapped as a group and represented as a +/// large integer. 
+/// To access a specific bitfield, a wide load/store plus a series of bit
+/// operations is needed. However, if the bitfield is of a legal type, it is
+/// more efficient to access it directly. Another case is when the bitfield
+/// group is of an illegal integer type: if we only access one bitfield in
+/// the group, we may have a chance to shrink the illegal type to a smaller
+/// legal type and save some bit-mask operations.
+/// Although the major motivation of the optimization is bitfield accesses,
+/// it is not limited to them and can be applied to general memory accesses
+/// with similar patterns.
+///
+/// Description of the optimization:
+/// The optimization has three basic categories. The first category is store
+/// shrinking. If we are storing a large value, with some of its consecutive
+/// bits replaced by a small value through bit manipulation, we can use two
+/// stores, one for the large value and one for the small, to replace the
+/// original store and the bit manipulation operations. Even better, if the
+/// large value comes from a load with the same address as the store, the
+/// store of the large value can be saved as well.
+///
+/// The second category is another kind of store shrinking. We look for an
+/// operation sequence like: load a large value, adjust some bits of it, and
+/// store it back to the same address as the load. The essence of the
+/// sequence is to change some bits of the large value but keep the rest of
+/// it the same. We may shrink the size of both the load and the store, as
+/// well as the bit manipulation operations in between, as long as the bits
+/// to be changed are still covered. This is especially useful when the load
+/// and store are of an illegal type: shrinking them to a smaller legal type
+/// saves many extra operations that would otherwise be generated during
+/// ISel legalization.
+///
+/// The third category is load use shrinking. If we are loading a large
+/// value and using it in a series of operations, but in the end only use a
+/// small part of the result, we can shrink the whole chain, including the
+/// load.
+///
+/// Some notes about the algorithm:
+/// * The algorithm scans the program and tries to recognize and transform the
+/// patterns above. It runs iteratively until no more changes happen.
+/// * To prevent the optimization from blocking load/store coalescing, it is
+/// invoked late in the pipeline, just before CodeGenPrepare. At this late
+/// stage, both the pattern matching and the related safety checks become
+/// more difficult, because earlier optimizations have introduced merged
+/// loads/stores and more complex control flow. That is why MemorySSA is used
+/// here: it provides scalable and precise safety checks even when we insert
+/// a smaller access into a block that is many blocks away from the original
+/// access.
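As a concrete illustration of the first category, here is the kind of rewrite
the pass performs, adapted from test2 in the patch's
test/CodeGen/ARM/store-shrink.ll (little-endian layout assumed; value names
are simplified here, and the exact names/alignment the pass emits may
differ). Before:

  %old   = load i32, i32* %p, align 4
  %new   = and i32 %n, 65535      ; the 16 bits being written (SmallVal)
  %clear = and i32 %old, -65536   ; LargeVal with the low 16 bits cleared
  %set   = or i32 %clear, %new
  store i32 %set, i32* %p, align 4

After shrinking, only a narrow store of the small value remains; the wide
load and the masking become dead because the large value was loaded from the
same address:

  %new   = and i32 %n, 65535
  %trunc = trunc i32 %new to i16
  %cast  = bitcast i32* %p to i16*
  store i16 %trunc, i16* %cast, align 4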
+//===----------------------------------------------------------------------===// + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + +#define DEBUG_TYPE "memaccessshrink" + +using namespace llvm; +using namespace llvm::PatternMatch; + +STATISTIC(NumStoreShrunkBySplit, "Number of stores shrunk by splitting"); +STATISTIC(NumStoreShrunkToLegal, "Number of stores shrunk to legal types"); +STATISTIC(NumLoadShrunk, "Number of Loads shrunk"); + +namespace { +/// Describe the bit range of a value. +struct BitRange { + unsigned Shift; + unsigned Width; + bool isDisjoint(BitRange &BR); +}; + +/// \brief The memory access shrinking pass. +class MemAccessShrinkingPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + MemAccessShrinkingPass(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TM(TM) { + initializeMemAccessShrinkingPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &Fn) override; + + StringRef getPassName() const override { return "MemAccess Shrinking"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + } + + struct StoreShrunkInfo { + StoreInst &SI; + Value *LargeVal; + Value *SmallVal; + ConstantInt *Cst; + BitRange &BR; + StoreShrunkInfo(StoreInst &SI, Value *LargeVal, Value *SmallVal, + ConstantInt *Cst, BitRange &BR) + : SI(SI), LargeVal(LargeVal), SmallVal(SmallVal), Cst(Cst), BR(BR) {} + }; + +private: + const DataLayout *DL = nullptr; + const DominatorTree *DT = nullptr; + const TargetMachine *TM = nullptr; + const TargetLowering *TLI = nullptr; + const TargetLibraryInfo *TLInfo = nullptr; + LLVMContext *Context; + + MemorySSA *MSSA; + MemorySSAWalker *MSSAWalker; + std::unique_ptr MSSAUpdater; + + // Candidate instructions to be erased if they have no other uses. + SmallPtrSet CandidatesToErase; + + // MultiUsesSeen shows a multiuse node is found on the Def-Use Chain. + bool MultiUsesSeen; + // SavedInsts shows how many instructions saved for a specific shrink so far. + // It is used to evaluate whether some instructions will be saved for the + // shrinking especially when MultiUsesSeen is true. 
+ unsigned SavedInsts; + + BitRange analyzeOrAndPattern(Value &SmallVal, ConstantInt &Cst, + unsigned BitSize, bool &DoReplacement); + BitRange analyzeBOpPattern(Value &Val, ConstantInt &Cst, unsigned BitSize); + bool extendBitRange(BitRange &BR, unsigned BitSize, unsigned Align); + bool hasClobberBetween(Instruction &From, Instruction &To); + bool needNewStore(Value &LargeVal, StoreInst &SI); + Value *createNewPtr(Value *Ptr, unsigned StOffset, Type *NewTy, + IRBuilder<> &Builder); + + bool findBRWithLegalType(StoreInst &SI, BitRange &BR); + bool isLegalToShrinkStore(LoadInst &LI, StoreInst &SI); + bool tryShrinkStoreBySplit(StoreInst &SI); + bool tryShrinkStoreToLegalTy(StoreInst &SI); + bool splitStoreIntoTwo(StoreShrunkInfo &SInfo); + bool shrinkStoreToLegalType(StoreShrunkInfo &SInfo); + bool tryShrinkLoadUse(Instruction &IN); + bool tryShrink(Function &Fn); + Value *shrinkInsts(Value *NewPtr, BitRange &BR, + SmallVectorImpl &Insts, + bool AllClobberInToBlock, unsigned ResShift, + IRBuilder<> &Builder); + bool matchInstsToShrink(Value *Val, BitRange &BR, + SmallVectorImpl &Insts); + bool hasClobberBetween(Instruction &From, Instruction &To, + const MemoryLocation &Mem, bool &AllClobberInToBlock); + void addSaveCandidate(Instruction &Inst, bool ReplaceAllUses = false); + bool isBeneficial(unsigned ResShift, SmallVectorImpl &Insts); + void markCandForDeletion(Instruction &I) { CandidatesToErase.insert(&I); } + bool removeDeadInsts(); +}; +} // namespace + +char MemAccessShrinkingPass::ID = 0; +INITIALIZE_TM_PASS_BEGIN(MemAccessShrinkingPass, "memaccessshrink", + "MemAccess Shrinking", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_TM_PASS_END(MemAccessShrinkingPass, "memaccessshrink", + "MemAccess Shrinking", false, false) + +FunctionPass *llvm::createMemAccessShrinkingPass(const TargetMachine *TM) { + return new MemAccessShrinkingPass(TM); +} + +/// Analyze pattern or(and(LargeVal, \p Cst), \p SmallVal). A common usage +/// of the pattern is to replace some consecutive bits in LargeVal with +/// nonzero bits of SmallVal, but to do the replacement as described, +/// some requirements have to be satisfied like that \p Cst has to contain +/// a consecutive run of zeros, and that all the nonzero bits of \p SmallVal +/// should be inside the range of consecutive zeros bits of \p Cst. +/// \p DoReplacement will contain the result of whether the requirements are +/// satisfied. We will also return BitRange describing the maximum range of +/// LargeVal that may be modified. +BitRange MemAccessShrinkingPass::analyzeOrAndPattern(Value &SmallVal, + ConstantInt &Cst, + unsigned BitSize, + bool &DoReplacement) { + // Cst is the mask. Analyze the pattern of mask after sext it to uint64_t. We + // will handle patterns like either 0..01..1 or 1..10..01..1 + APInt Mask = Cst.getValue().sextOrTrunc(BitSize); + assert(Mask.getBitWidth() == BitSize && "Unexpected bitwidth of Mask"); + unsigned MaskLeadOnes = Mask.countLeadingOnes(); + unsigned MaskTrailOnes = Mask.countTrailingOnes(); + unsigned MaskMidZeros = !MaskLeadOnes + ? Mask.countLeadingZeros() + : Mask.ashr(MaskTrailOnes).countTrailingZeros(); + // Replace some consecutive bits in LargeVal with all the bits of SmallVal. 
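To make the mask analysis concrete (numbers invented for illustration): for a
32-bit store with Cst == 0xFFFF00FF, the mask has MaskLeadOnes == 16,
MaskTrailOnes == 8 and MaskMidZeros == 8, so BR is {Shift = 8, Width = 8};
the replacement form is only usable if SmallVal is known to be zero outside
bits [8, 16), e.g. SmallVal == and(%y, 0xFF00). Otherwise DoReplacement is
cleared and BR is widened below to cover every bit that might change.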
+ DoReplacement = true; + if (MaskLeadOnes == BitSize) { + MaskMidZeros = 0; + DoReplacement = false; + } else if (MaskLeadOnes + MaskMidZeros + MaskTrailOnes != BitSize) { + // See if we have a continuous run of zeros. + MaskMidZeros = BitSize - MaskLeadOnes - MaskTrailOnes; + DoReplacement = false; + } + + // Check SmallVal only provides nonzero bits within range from lowbits + // (MaskTrailOnes) to highbits (MaskTrailOnes + MaskMidZeros). + APInt BitMask = + ~APInt::getBitsSet(BitSize, MaskTrailOnes, MaskTrailOnes + MaskMidZeros); + + // Find out the range in which 1 appears in SmallVal. + APInt KnownOne(BitSize, 0), KnownZero(BitSize, 0); + computeKnownBits(&SmallVal, KnownZero, KnownOne, *DL, 0); + + BitRange BR; + BR.Shift = MaskTrailOnes; + BR.Width = MaskMidZeros; + // Expect the bits being 1 in BitMask are all KnownZero bits in SmallVal, + // otherwise we need to set ShrinkWithMaskedVal to false and expand BR. + if ((KnownZero & BitMask) != BitMask) { + DoReplacement = false; + // Lower is the first bit for which we are not sure about the fact of + // its being zero. + unsigned Lower = KnownZero.countTrailingOnes(); + // Higher is the last bit for which we are not sure about the fact of + // its being zero. + unsigned Higher = BitSize - KnownZero.countLeadingOnes(); + BR.Shift = std::min(Lower, MaskTrailOnes); + BR.Width = std::max(Higher, MaskTrailOnes + MaskMidZeros) - BR.Shift; + } + return BR; +} + +/// Analyze pattern or/xor/and(load P, \p Cst). The result of the pattern +/// is a partially changed load P. The func will return the the range +/// showing where the load result are changed. +BitRange MemAccessShrinkingPass::analyzeBOpPattern(Value &Val, ConstantInt &Cst, + unsigned BitSize) { + APInt Mask = Cst.getValue().sextOrTrunc(BitSize); + BinaryOperator *BOP = cast(&Val); + if (BOP->getOpcode() == Instruction::And) + Mask = ~Mask; + + BitRange BR; + BR.Shift = Mask.countTrailingZeros(); + BR.Width = Mask.getBitWidth() - BR.Shift; + if (BR.Width) + BR.Width = BR.Width - Mask.countLeadingZeros(); + return BR; +} + +/// Extend \p BR so the new BR.Width bits can form a legal type. +bool MemAccessShrinkingPass::extendBitRange(BitRange &BR, unsigned BitSize, + unsigned Align) { + BitRange NewBR; + NewBR.Width = PowerOf2Ceil(BR.Width); + Type *NewTy = Type::getIntNTy(*Context, NewBR.Width); + Type *StoreTy = Type::getIntNTy(*Context, PowerOf2Ceil(BitSize)); + + // Check if we can find a new Shift for the Width of NewBR, so that + // NewBR forms a new range covering the old modified range without + // worsening alignment. + auto coverOldRange = [&](BitRange &NewBR, BitRange &BR) -> bool { + unsigned MAlign = MinAlign(Align, DL->getABITypeAlignment(NewTy)); + int Shift = BR.Shift - BR.Shift % (MAlign * 8); + while (Shift >= 0) { + if (NewBR.Width + Shift <= BitSize && + NewBR.Width + Shift >= BR.Width + BR.Shift) { + NewBR.Shift = Shift; + return true; + } + Shift -= MAlign * 8; + } + return false; + }; + // See whether we can store NewTy legally. + auto isStoreLegalType = [&](Type *NewTy) -> bool { + EVT OldEVT = TLI->getValueType(*DL, StoreTy); + EVT NewEVT = TLI->getValueType(*DL, NewTy); + return TLI->isOperationLegalOrCustom(ISD::STORE, NewEVT) || + TLI->isTruncStoreLegalOrCustom(OldEVT, NewEVT); + }; + + // Try to find the minimal NewBR.Width which can form a legal type and cover + // all the old modified bits. 
+ while (NewBR.Width < BitSize && + (!isStoreLegalType(NewTy) || + TLI->isNarrowingExpensive(EVT::getEVT(StoreTy), EVT::getEVT(NewTy)) || + !coverOldRange(NewBR, BR))) { + NewBR.Width = NextPowerOf2(NewBR.Width); + NewTy = Type::getIntNTy(*Context, NewBR.Width); + } + BR.Width = NewBR.Width; + BR.Shift = NewBR.Shift; + + if (BR.Width >= BitSize) + return false; + return true; +} + +/// Compute the offset used to compute the new ptr address. It will be +/// mainly based on BR and the endian of the target. +static unsigned computeBitOffset(BitRange &BR, unsigned BitSize, + const DataLayout &DL) { + unsigned BitOffset; + if (DL.isBigEndian()) + BitOffset = BitSize - BR.Shift - BR.Width; + else + BitOffset = BR.Shift; + + return BitOffset; +} + +static unsigned computeByteOffset(BitRange &BR, unsigned BitSize, + const DataLayout &DL) { + return computeBitOffset(BR, BitSize, DL) / 8; +} + +/// Check whether V1 and V2 has the same ptr value by looking through bitcast. +static bool hasSamePtr(Value *V1, Value *V2) { + Value *NV1 = nullptr; + Value *NV2 = nullptr; + if (V1 == V2) + return true; + if (BitCastInst *BC1 = dyn_cast(V1)) + NV1 = BC1->getOperand(0); + if (BitCastInst *BC2 = dyn_cast(V2)) + NV2 = BC2->getOperand(0); + if (!NV1 && !NV2) + return false; + if (V1 == NV2 || V2 == NV1 || NV1 == NV2) + return true; + return false; +} + +/// Check whether there is any clobber instruction between \p From and \p To +/// for the memory location accessed by \p To. +bool MemAccessShrinkingPass::hasClobberBetween(Instruction &From, + Instruction &To) { + assert(MSSA->getMemoryAccess(&To) && "Expect To has valid MemoryAccess"); + MemoryAccess *FromAccess = MSSA->getMemoryAccess(&From); + assert(FromAccess && "Expect From has valid MemoryAccess"); + MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(&To); + if (FromAccess != DefiningAccess && + MSSA->dominates(FromAccess, DefiningAccess)) + return true; + return false; +} + +/// It is possible that we already have a store of LargeVal and it is not +/// clobbered, then we can use it and don't have to generate a a new store. +bool MemAccessShrinkingPass::needNewStore(Value &LargeVal, StoreInst &SI) { + for (User *U : LargeVal.users()) { + StoreInst *PrevSI = dyn_cast(U); + if (!PrevSI || + !hasSamePtr(PrevSI->getPointerOperand(), SI.getPointerOperand()) || + PrevSI->getValueOperand()->getType() != SI.getValueOperand()->getType()) + continue; + if (!hasClobberBetween(*PrevSI, SI)) + return false; + } + return true; +} + +/// Create new address to be used by shrunk load/store based on original +/// address \p Ptr, offset \p StOffset and the new type \p NewTy. +Value *MemAccessShrinkingPass::createNewPtr(Value *Ptr, unsigned StOffset, + Type *NewTy, IRBuilder<> &Builder) { + Value *NewPtr = Ptr; + unsigned AS = cast(Ptr->getType())->getAddressSpace(); + if (StOffset) { + ConstantInt *Idx = ConstantInt::get(Type::getInt32Ty(*Context), StOffset); + NewPtr = + Builder.CreateBitCast(Ptr, Type::getInt8PtrTy(*Context, AS), "cast"); + NewPtr = + Builder.CreateGEP(Type::getInt8Ty(*Context), NewPtr, Idx, "uglygep"); + } + return Builder.CreateBitCast(NewPtr, NewTy->getPointerTo(AS), "cast"); +} + +/// Check if it is legal to shrink the store if the input LargeVal is a +/// LoadInst. +bool MemAccessShrinkingPass::isLegalToShrinkStore(LoadInst &LI, StoreInst &SI) { + Value *Ptr = SI.getOperand(1); + // LI should have the same address as SI. 
+ if (!hasSamePtr(LI.getPointerOperand(), Ptr)) + return false; + if (LI.isVolatile() || !LI.isUnordered()) + return false; + // Make sure the memory location of LI/SI is not clobbered between them. + if (hasClobberBetween(LI, SI)) + return false; + return true; +} + +/// we try to handle is when a smaller value is "inserted" into some bits +/// of a larger value. This typically looks something like: +/// +/// store(or(and(LargeVal, MaskConstant), SmallVal)), address) +/// +/// Here, we try to use a narrow store of `SmallVal` rather than bit +/// operations to combine them: +/// +/// store(LargeVal, address) +/// store(SmallVal, address) +/// +/// Or if `LargeVal` was a load, we may not need to store it at all: +/// +/// store(SmallVal, address) +/// +/// We may also be able incorporate shifts of `SmallVal` by using an offset +/// of `address`. +/// +/// However, this may not be valid if, for example, the size of `SmallVal` +/// isn't a valid type to store or the shift can't be modeled as a valid +/// offset from the address, then we can still try another transformation +/// which is to shrink the store to smaller legal width when the original +/// width of the store is illegal. +bool MemAccessShrinkingPass::tryShrinkStoreBySplit(StoreInst &SI) { + Value *Val = SI.getOperand(0); + Type *StoreTy = Val->getType(); + if (StoreTy->isVectorTy() || !StoreTy->isIntegerTy()) + return false; + if (SI.isVolatile() || !SI.isUnordered()) + return false; + unsigned BitSize = DL->getTypeSizeInBits(StoreTy); + if (BitSize != DL->getTypeStoreSizeInBits(StoreTy)) + return false; + + Value *LargeVal; + Value *SmallVal; + ConstantInt *Cst; + if (!(match(Val, m_Or(m_And(m_Value(LargeVal), m_ConstantInt(Cst)), + m_Value(SmallVal))) && + match(LargeVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))) && + !(match(Val, m_Or(m_Value(SmallVal), + m_And(m_Value(LargeVal), m_ConstantInt(Cst)))) && + match(LargeVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))) && + !match(Val, m_c_Or(m_And(m_Value(LargeVal), m_ConstantInt(Cst)), + m_Value(SmallVal)))) + return false; + + LoadInst *LI = dyn_cast(LargeVal); + if (LI && !isLegalToShrinkStore(*LI, SI)) + return false; + + // Return BR which indicates the range of the LargeVal that will actually + // be modified. If the pattern does the work to replace portion of LargeVal + // with bits from SmallVal, we can do the split store transformation. + // Otherwise, we can only try shrink the store to legal type transformation. + // ToSplitStore will be populated to reflect that. + bool ToSplitStore; + BitRange BR = analyzeOrAndPattern(*SmallVal, *Cst, BitSize, ToSplitStore); + assert(BR.Shift + BR.Width <= BitSize && "Unexpected BitRange!"); + if (!BR.Width) + return false; + + if (ToSplitStore) { + // Get the offset from Ptr for the shrunk store. + unsigned StoreBitOffset = computeBitOffset(BR, BitSize, *DL); + if (StoreBitOffset % 8 != 0) + ToSplitStore = false; + + // If BR.Width is not the length of legal integer type, we cannot + // store SmallVal directly. 
+ if (BR.Width != 8 && BR.Width != 16 && BR.Width != 32 && BR.Width != 64) + ToSplitStore = false; + } + + if (!ToSplitStore) { + if (!LI) + return false; + if (!findBRWithLegalType(SI, BR)) + return false; + StoreShrunkInfo SInfo(SI, LI, SmallVal, Cst, BR); + shrinkStoreToLegalType(SInfo); + return true; + } + IntegerType *NewTy = Type::getIntNTy(*Context, BR.Width); + if (TLI->isNarrowingExpensive(EVT::getEVT(StoreTy), EVT::getEVT(NewTy))) + return false; + + StoreShrunkInfo SInfo(SI, LargeVal, SmallVal, Cst, BR); + splitStoreIntoTwo(SInfo); + return true; +} + +/// The pattern we try to shrink (which may also apply to code matching +/// the tryShrinkStoreBySplit pattern when that transformation isn't valid) +/// is to shrink the inputs of basic bit operations (or, and, xor) and then +/// store the smaller width result. This is valid whenever we know that +/// shrinking the inputs and operations doesn't change the result. For example, +/// if the removed bits are known to be 0s, 1s, or undef, we may be able to +/// avoid the bitwise computation on them. This is particularly useful when +/// the type width is not a legal type, and when the inputs are loads and +/// constants. +bool MemAccessShrinkingPass::tryShrinkStoreToLegalTy(StoreInst &SI) { + Value *Val = SI.getOperand(0); + Type *StoreTy = Val->getType(); + if (StoreTy->isVectorTy() || !StoreTy->isIntegerTy()) + return false; + if (SI.isVolatile() || !SI.isUnordered()) + return false; + unsigned BitSize = DL->getTypeSizeInBits(StoreTy); + if (BitSize != DL->getTypeStoreSizeInBits(StoreTy)) + return false; + + LoadInst *LI; + ConstantInt *Cst; + if (!match(Val, m_c_Or(m_Load(LI), m_ConstantInt(Cst))) && + !match(Val, m_c_And(m_Load(LI), m_ConstantInt(Cst))) && + !match(Val, m_c_Xor(m_Load(LI), m_ConstantInt(Cst)))) + return false; + + if (!isLegalToShrinkStore(*LI, SI)) + return false; + + // Find out the range of Val which is changed. + BitRange BR = analyzeBOpPattern(*Val, *Cst, BitSize); + assert(BR.Shift + BR.Width <= BitSize && "Unexpected BitRange!"); + if (!BR.Width) + return false; + if (!findBRWithLegalType(SI, BR)) + return false; + + StoreShrunkInfo SInfo(SI, LI, nullptr, Cst, BR); + shrinkStoreToLegalType(SInfo); + return true; +} + +/// Try to extend the existing BitRange \BR to legal integer width. +bool MemAccessShrinkingPass::findBRWithLegalType(StoreInst &SI, BitRange &BR) { + Type *StoreTy = SI.getOperand(0)->getType(); + unsigned Align = SI.getAlignment(); + unsigned BitSize = DL->getTypeSizeInBits(StoreTy); + // Check whether the store is of illegal type. If it is not, don't bother. + if (!TLI || TLI->isOperationLegalOrCustom(ISD::STORE, + TLI->getValueType(*DL, StoreTy))) + return false; + // Try to find out a BR of legal type. + if (!extendBitRange(BR, BitSize, Align)) + return false; + return true; +} + +/// The transformation to split the store into two: one is for the LargeVal +/// and one is for the SmallVal. The first store can be saved if the LargeVal +/// is got from a load or there is an existing store for the LargeVal. 
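For illustration, when LargeVal is neither a load nor already stored to the
same address, the rewrite looks roughly like this (a hand-written sketch with
invented names, little-endian layout assumed; the pass's actual output may
differ in details). Before:

  %low     = and i32 %n, 255        ; SmallVal, known zero above bit 8
  %cleared = and i32 %big, -256     ; LargeVal with the low byte cleared
  %merged  = or i32 %cleared, %low
  store i32 %merged, i32* %p, align 4

After, the wide store of LargeVal is kept and a narrow store of SmallVal
overwrites just the low byte:

  store i32 %big, i32* %p, align 4
  %trunc = trunc i32 %low to i8
  %cast  = bitcast i32* %p to i8*
  store i8 %trunc, i8* %cast, align 4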
+bool MemAccessShrinkingPass::splitStoreIntoTwo(StoreShrunkInfo &SInfo) { + BitRange &BR = SInfo.BR; + StoreInst &SI = SInfo.SI; + IRBuilder<> Builder(*Context); + Builder.SetInsertPoint(&SI); + Value *Val = SI.getOperand(0); + Value *Ptr = SI.getOperand(1); + + Type *StoreTy = SI.getOperand(0)->getType(); + unsigned BitSize = DL->getTypeSizeInBits(StoreTy); + unsigned StoreByteOffset = computeByteOffset(BR, BitSize, *DL); + unsigned Align = SI.getAlignment(); + if (StoreByteOffset) + Align = MinAlign(StoreByteOffset, Align); + + IntegerType *NewTy = Type::getIntNTy(*Context, BR.Width); + Value *NewPtr = createNewPtr(Ptr, StoreByteOffset, NewTy, Builder); + + // Check if we need to split the original store and generate a new + // store for the LargeVal. + if (!isa(SInfo.LargeVal) && needNewStore(*SInfo.LargeVal, SI)) { + StoreInst *NewSI = cast(SI.clone()); + NewSI->setOperand(0, SInfo.LargeVal); + NewSI->setOperand(1, Ptr); + Builder.Insert(NewSI); + DEBUG(dbgs() << "MemShrink: Insert" << *NewSI << " before" << SI << "\n"); + // MemorySSA update for the new store. + MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MemoryDef *NewMemAcc = cast( + MSSAUpdater->createMemoryAccessBefore(NewSI, Def, OldMemAcc)); + MSSAUpdater->insertDef(NewMemAcc, false); + } + + // Create the New Value to store. + Value *SmallVal = nullptr; + if (auto MVCst = dyn_cast(SInfo.SmallVal)) { + APInt ModifiedCst = MVCst->getValue().lshr(BR.Shift).trunc(BR.Width); + SmallVal = ConstantInt::get(*Context, ModifiedCst); + } else { + Value *ShiftedVal = + BR.Shift ? Builder.CreateLShr(SInfo.SmallVal, BR.Shift, "lshr") + : SInfo.SmallVal; + SmallVal = Builder.CreateTruncOrBitCast(ShiftedVal, NewTy, "trunc"); + } + + // Create the new store and remove the old one. + StoreInst *NewSI = cast(SI.clone()); + NewSI->setOperand(0, SmallVal); + NewSI->setOperand(1, NewPtr); + NewSI->setAlignment(Align); + Builder.Insert(NewSI); + + DEBUG(dbgs() << "MemShrink: Replace" << SI << " with" << *NewSI << "\n"); + // MemorySSA update for the shrunk store. + MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MemoryAccess *NewMemAcc = + MSSAUpdater->createMemoryAccessBefore(NewSI, Def, OldMemAcc); + OldMemAcc->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(OldMemAcc); + + markCandForDeletion(*cast(Val)); + SI.eraseFromParent(); + NumStoreShrunkBySplit++; + return true; +} + +/// The transformation to shrink the value of the store to smaller width. +bool MemAccessShrinkingPass::shrinkStoreToLegalType(StoreShrunkInfo &SInfo) { + BitRange &BR = SInfo.BR; + StoreInst &SI = SInfo.SI; + IRBuilder<> Builder(*Context); + Builder.SetInsertPoint(&SI); + Value *Val = SI.getOperand(0); + Value *Ptr = SI.getOperand(1); + + Type *StoreTy = SI.getOperand(0)->getType(); + unsigned BitSize = DL->getTypeSizeInBits(StoreTy); + unsigned StoreByteOffset = computeByteOffset(BR, BitSize, *DL); + unsigned Align = SI.getAlignment(); + if (StoreByteOffset) + Align = MinAlign(StoreByteOffset, Align); + + IntegerType *NewTy = Type::getIntNTy(*Context, BR.Width); + Value *NewPtr = createNewPtr(Ptr, StoreByteOffset, NewTy, Builder); + + LoadInst *NewLI = cast(cast(SInfo.LargeVal)->clone()); + NewLI->mutateType(NewTy); + NewLI->setOperand(0, NewPtr); + NewLI->setAlignment(Align); + Builder.Insert(NewLI, "load.trunc"); + // MemorySSA update for the shrunk load. 
+ MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MSSAUpdater->createMemoryAccessBefore(NewLI, Def, OldMemAcc); + + // Create the SmallVal to store. + Value *SmallVal = nullptr; + APInt ModifiedCst = SInfo.Cst->getValue().lshr(BR.Shift).trunc(BR.Width); + ConstantInt *NewCst = ConstantInt::get(*Context, ModifiedCst); + if (SInfo.SmallVal) { + // Use SInfo.SmallVal to get the SmallVal. + Value *Trunc; + if (auto MVCst = dyn_cast(SInfo.SmallVal)) { + ModifiedCst = MVCst->getValue().lshr(BR.Shift).trunc(BR.Width); + Trunc = ConstantInt::get(*Context, ModifiedCst); + } else { + Value *ShiftedVal = + BR.Shift ? Builder.CreateLShr(SInfo.SmallVal, BR.Shift, "lshr") + : SInfo.SmallVal; + Trunc = Builder.CreateTruncOrBitCast(ShiftedVal, NewTy, "trunc"); + } + Value *NewAnd = Builder.CreateAnd(NewLI, NewCst, "and.trunc"); + SmallVal = Builder.CreateOr(NewAnd, Trunc, "or.trunc"); + } else { + BinaryOperator *BOP = cast(Val); + switch (BOP->getOpcode()) { + default: + llvm_unreachable("BOP can only be And/Or/Xor"); + case Instruction::And: + SmallVal = Builder.CreateAnd(NewLI, NewCst, "and.trunc"); + break; + case Instruction::Or: + SmallVal = Builder.CreateOr(NewLI, NewCst, "or.trunc"); + break; + case Instruction::Xor: + SmallVal = Builder.CreateXor(NewLI, NewCst, "xor.trunc"); + break; + } + } + + // Create the new store and remove the old one. + StoreInst *NewSI = cast(SI.clone()); + NewSI->setOperand(0, SmallVal); + NewSI->setOperand(1, NewPtr); + NewSI->setAlignment(Align); + Builder.Insert(NewSI); + + DEBUG(dbgs() << "MemShrink: Replace" << SI << " with" << *NewSI << "\n"); + // MemorySSA update for the shrunk store. + OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + Def = OldMemAcc->getDefiningAccess(); + MemoryAccess *NewMemAcc = + MSSAUpdater->createMemoryAccessBefore(NewSI, Def, OldMemAcc); + OldMemAcc->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(OldMemAcc); + + markCandForDeletion(*cast(Val)); + SI.eraseFromParent(); + NumStoreShrunkToLegal++; + return true; +} + +/// AI contains a consecutive run of 1. Check the range \p BR of \p AI are all +/// 1s. +static bool isRangeofAIAllOnes(BitRange &BR, APInt &AI) { + if (BR.Shift < AI.countTrailingZeros() || + BR.Width + BR.Shift > (AI.getBitWidth() - AI.countLeadingZeros())) + return false; + return true; +} + +bool BitRange::isDisjoint(BitRange &BR) { + return (Shift > BR.Shift + BR.Width - 1) || (BR.Shift > Shift + Width - 1); +} + +/// Check if there is no instruction between \p From and \p To which may +/// clobber the MemoryLocation \p Mem. However, if there are clobbers and +/// all the clobber instructions between \p From and \p To are in the same +/// block as \p To, We will set \p AllClobberInToBlock to true. +bool MemAccessShrinkingPass::hasClobberBetween(Instruction &From, + Instruction &To, + const MemoryLocation &Mem, + bool &AllClobberInToBlock) { + assert(DT->dominates(&From, &To) && "From doesn't dominate To"); + const BasicBlock *FBB = From.getParent(); + const BasicBlock *TBB = To.getParent(); + // Collect BasicBlocks to scan between FBB and TBB into BBSet. + SmallPtrSet BBSet; + SmallVector Worklist; + BBSet.insert(FBB); + BBSet.insert(TBB); + Worklist.push_back(TBB); + do { + const BasicBlock *BB = Worklist.pop_back_val(); + for (const BasicBlock *Pred : predecessors(BB)) { + if (!BBSet.count(Pred)) { + BBSet.insert(Pred); + Worklist.push_back(Pred); + } + } + } while (!Worklist.empty()); + + // Check the DefsList inside of each BB. 
+ bool hasClobber = false; + AllClobberInToBlock = true; + for (const BasicBlock *BB : BBSet) { + const MemorySSA::DefsList *DList = MSSA->getBlockDefs(BB); + if (!DList) + continue; + + for (const MemoryAccess &MA : *DList) { + if (const MemoryDef *MD = dyn_cast(&MA)) { + Instruction *Inst = MD->getMemoryInst(); + if ((FBB == Inst->getParent() && DT->dominates(Inst, &From)) || + (TBB == Inst->getParent() && DT->dominates(&To, Inst))) + continue; + // Check whether MD is a clobber of Mem. + if (MSSAWalker->getClobberingMemoryAccess(const_cast(MD), + Mem) == MD) { + if (Inst->getParent() != TBB) + AllClobberInToBlock = false; + hasClobber = true; + } + } + } + } + if (!hasClobber) + AllClobberInToBlock = false; + return hasClobber; +} + +/// Match \p Val to the pattern: +/// Bop(...Bop(V, Cst_1), ..., Cst_N) and pattern: +/// or(and(...or(and(LI, Cst_1), SmallVal_1), ..., Cse_N), SmallVal_N), +/// and find the final load to be shrunk. +/// All the Bop instructions in the first pattern and the final load will +/// be added into \p Insts and will be shrunk afterwards. +bool MemAccessShrinkingPass::matchInstsToShrink( + Value *Val, BitRange &BR, SmallVectorImpl &Insts) { + unsigned BitSize = DL->getTypeSizeInBits(Val->getType()); + + // Match the pattern: + // Bop(...Bop(V, Cst_1), Cst_2, ..., Cst_N), Bop can be Add/Sub/And/Or/Xor. + // Those BinaryOperator instructions are easy to be shrunk to BR range. + Value *Opd; + ConstantInt *Cst; + while (match(Val, m_Add(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Sub(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_And(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Or(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Xor(m_Value(Opd), m_ConstantInt(Cst)))) { + Instruction *BOP = cast(Val); + addSaveCandidate(*BOP); + Insts.push_back(BOP); + Val = Opd; + } + + LoadInst *LI; + if ((LI = dyn_cast(Val))) { + addSaveCandidate(*LI); + Insts.push_back(LI); + return true; + } + + // Match the pattern: + // or(and(...or(and(LI, Cst_1), SmallVal_1), ..., Cse_N), SmallVal_N). + // Analyze the range of LargeVal which may be modified by the current + // or(and(LargeVal, Cst), SmallVal)) operations. If each or(and(...)) + // pair modifies a BitRange which is disjoint with current BR, we can + // skip all these operations when we shrink the final load, so we only + // have to add the final load into Insts. + Value *LargeVal; + Value *SmallVal; + BitRange OtherBR; + while (match(Val, m_c_Or(m_And(m_Load(LI), m_ConstantInt(Cst)), + m_Value(SmallVal))) || + (match(Val, m_Or(m_And(m_Value(LargeVal), m_ConstantInt(Cst)), + m_Value(SmallVal))) && + match(LargeVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))) || + (match(Val, m_Or(m_Value(SmallVal), + m_And(m_Value(LargeVal), m_ConstantInt(Cst)))) && + match(LargeVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value())))) { + addSaveCandidate(*cast(Val)); + bool DoReplacement; + OtherBR.Shift = 0; + OtherBR.Width = BitSize; + // Analyze BitRange of current or(and(LargeVal, Cst), SmallVal) operations. + OtherBR = analyzeOrAndPattern(*SmallVal, *Cst, BitSize, DoReplacement); + // If OtherBR is not disjoint with BR, we cannot shrink load to its BR + // portion. + if (!BR.isDisjoint(OtherBR)) + return false; + if (LI) { + Insts.push_back(LI); + return true; + } + Val = LargeVal; + } + + return false; +} + +/// Find the next MemoryAccess after LI in the same BB. 
+MemoryAccess *findNextMemoryAccess(LoadInst &LI, MemorySSA &MSSA) { + for (auto Scan = LI.getIterator(); Scan != LI.getParent()->end(); ++Scan) { + if (MemoryAccess *MA = MSSA.getMemoryAccess(&*Scan)) + return MA; + } + return nullptr; +} + +/// Shrink \p Insts according to the range BR. +Value *MemAccessShrinkingPass::shrinkInsts( + Value *NewPtr, BitRange &BR, SmallVectorImpl &Insts, + bool AllClobberInToBlock, unsigned ResShift, IRBuilder<> &Builder) { + Value *NewVal; + IntegerType *NewTy = Type::getIntNTy(*Context, BR.Width); + Instruction &InsertPt = *Builder.GetInsertPoint(); + for (Instruction *I : make_range(Insts.rbegin(), Insts.rend())) { + if (LoadInst *LI = dyn_cast(I)) { + unsigned BitSize = DL->getTypeSizeInBits(LI->getType()); + unsigned LoadByteOffset = computeByteOffset(BR, BitSize, *DL); + unsigned Align = MinAlign(LoadByteOffset, LI->getAlignment()); + Instruction &NewInsertPt = *(InsertPt.getParent()->getFirstInsertionPt()); + if (AllClobberInToBlock && (&InsertPt != &NewInsertPt)) { + // If we use a new insertion point, we have to recreate the NewPtr in + // the new place. + Builder.SetInsertPoint(&NewInsertPt); + RecursivelyDeleteTriviallyDeadInstructions(NewPtr); + NewPtr = createNewPtr(LI->getPointerOperand(), LoadByteOffset, NewTy, + Builder); + } + // Create shrunk load. + LoadInst *NewLI = cast(LI->clone()); + NewLI->setOperand(0, NewPtr); + NewLI->mutateType(NewTy); + NewLI->setAlignment(Align); + Builder.Insert(NewLI, "load.trunc"); + NewVal = NewLI; + // Update MemorySSA. + MemoryUseOrDef *OldMemAcc = + cast(MSSA->getMemoryAccess(LI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + // Find a proper position to insert the new MemoryAccess. + MemoryAccess *Next = findNextMemoryAccess(*NewLI, *MSSA); + if (Next) + MSSAUpdater->createMemoryAccessBefore(NewLI, Def, + cast(Next)); + else + MSSAUpdater->createMemoryAccessInBB(NewLI, Def, NewLI->getParent(), + MemorySSA::End); + } else { + // shrink Add/Sub/And/Xor/Or in Insts. + BinaryOperator *BOP = cast(I); + ConstantInt *Cst = cast(BOP->getOperand(1)); + APInt NewAPInt = Cst->getValue().extractBits(BR.Width, BR.Shift); + ConstantInt *NewCst = + ConstantInt::get(*Context, NewAPInt.zextOrTrunc(BR.Width)); + NewVal = Builder.CreateBinOp(BOP->getOpcode(), NewVal, NewCst); + } + } + // Adjust the type to be consistent with the input instruction. + IntegerType *UseType = cast(InsertPt.getType()); + if (DL->getTypeSizeInBits(UseType) != BR.Width) + NewVal = Builder.CreateZExt(NewVal, UseType, "trunc.zext"); + // Adjust the NewVal with ResShift. + if (ResShift) { + ConstantInt *NewCst = ConstantInt::get(UseType, ResShift); + NewVal = Builder.CreateShl(NewVal, NewCst, "shl"); + } + return NewVal; +} + +/// Determine whether \p Inst can be counted as a saved instruction when we +/// are evaluate the benefit. If \p ReplaceAllUses is true, it means we are +/// going to replace all the uses of \p Inst with the shrunk result, so it +/// can still be counted as a saved instruction even if it has multiple uses. +void MemAccessShrinkingPass::addSaveCandidate(Instruction &Inst, + bool ReplaceAllUses) { + if (!MultiUsesSeen) { + // If the Inst has multiple uses and the current shrinking cannot replace + // all its uses, it will not regarded as a SavedInst. + if (ReplaceAllUses || Inst.hasOneUse()) + SavedInsts += (!isa(Inst) && !isa(Inst)); + else + MultiUsesSeen = true; + } +} + +/// Compare \p SavedInsts with instructions we are about to create and decide +/// whether it is beneficial to do the shrinking. 
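The unprofitable case is exercised by @load_trunc_multiuses in the patch's
test/CodeGen/ARM/load-shrink.ll: the wide load below has a second user, so
shrinking the truncated use would add a narrow load without making anything
dead, and the transformation is rejected (IR excerpted from that test):

  %bf.load  = load i64, i64* %ptr, align 8
  %bf.trunc = trunc i64 %bf.load to i16
  %cmp1     = icmp eq i16 %bf.trunc, 3
  %cmp2     = icmp ult i64 %bf.load, 1500000
  %cmp      = and i1 %cmp1, %cmp2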
+bool MemAccessShrinkingPass::isBeneficial( + unsigned ResShift, SmallVectorImpl &Insts) { + unsigned InstsToCreate = 0; + if (ResShift) + InstsToCreate++; + InstsToCreate += Insts.size(); + return SavedInsts >= InstsToCreate; +} + +/// When the input instruction \p IN is and(Val, Cst) or trunc, it indicates +/// only a portion of its input value has been used. We will walk through the +/// Def-Use chain, track the range of value which will be used, remember the +/// operations contributing to the used value range, and skip operations which +/// changes value range that is not to be used, until a load is found. +/// +/// So we start from and or trunc operations, then try to find a sequence of +/// shl or lshr operations. Those shifts are only changing the valid range +/// of input value. +/// +/// After we know the valid range of input value, we will collect the sequence +/// of binary operators, which we want to shrink their input to the same range. +/// To make it simpler, we requires one of the operands of any binary operator +/// has to be constant. +/// +/// Then we look for the pattern of or(and(LargeVal, Cst), SmallVal), which +/// will only change a portion of the input LargeVal and keep the rest of it +/// the same. If the operations pattern doesn't change the valid range, we can +/// use LargeVal as input and skip all the operations here. +/// +/// In the end, we look for a load which can provide the valid range directly +/// after shifting the address and shrinking the size of the load. +/// +/// An example: +/// and(lshr(add(or(and(load P, -65536), SmallVal), 0x1000000000000), 48), +/// 255) +/// +/// The actual valid range of the load is [48, 56). The value in the range is +/// incremented by 1. The sequence above will be transformed to: +/// zext(add(load_i8(P + 48), 1), 64) +/// +bool MemAccessShrinkingPass::tryShrinkLoadUse(Instruction &IN) { + // If the instruction is actually dead, skip it. + if (IN.use_empty()) + return false; + + Type *Ty = IN.getType(); + if (Ty->isVectorTy() || !Ty->isIntegerTy()) + return false; + + MultiUsesSeen = false; + SavedInsts = 0; + + // Match and(Val, Cst) or Trunc(Val) + Value *Val; + ConstantInt *AndCst = nullptr; + if (!match(&IN, m_Trunc(m_Value(Val))) && + !match(&IN, m_And(m_Value(Val), m_ConstantInt(AndCst)))) + return false; + addSaveCandidate(IN, true); + + // Get the initial BR showing the range of Val to be used. + BitRange BR; + BR.Shift = 0; + unsigned ResShift = 0; + unsigned BitSize = DL->getTypeSizeInBits(Val->getType()); + if (BitSize % 8 != 0) + return false; + if (AndCst) { + APInt AI = AndCst->getValue(); + // Check AI has consecutive one bits. The consecutive bits are the + // range to be used. If the num of trailing zeros are not zero, + // remember the num in ResShift and the val after shrinking needs + // to be shifted accordingly. + BR.Shift = AI.countTrailingZeros(); + ResShift = BR.Shift; + if (!(AI.lshr(BR.Shift) + 1).isPowerOf2() || BR.Shift == BitSize) + return false; + BR.Width = AI.getActiveBits() - BR.Shift; + } else { + BR.Width = DL->getTypeSizeInBits(Ty); + } + + // Match a series of LShr or Shl. Adjust BR.Shift accordingly. + // Note we have to be careful that valid bits may be cleared during + // back-and-force shifts. We use an all-one APInt Mask to simulate + // the shifts and track the valid bits. + Value *Opd; + unsigned NewShift = BR.Shift; + APInt Mask(BitSize, -1); + ConstantInt *Cst; + // Record the series of LShr or Shl in ShiftRecs. 
+ SmallVector, 4> ShiftRecs; + bool isLShr; + while ((isLShr = match(Val, m_LShr(m_Value(Opd), m_ConstantInt(Cst)))) || + match(Val, m_Shl(m_Value(Opd), m_ConstantInt(Cst)))) { + addSaveCandidate(*cast(Val)); + NewShift = isLShr ? (NewShift + Cst->getZExtValue()) + : (NewShift - Cst->getZExtValue()); + ShiftRecs.push_back({isLShr, Cst->getZExtValue()}); + Val = Opd; + } + // Iterate ShiftRecs in reverse order. Simulate the shifts of Mask. + for (auto SR : make_range(ShiftRecs.rbegin(), ShiftRecs.rend())) { + if (SR.first) + Mask = Mask.lshr(SR.second); + else + Mask = Mask.shl(SR.second); + } + // Mask contains a consecutive run of 1s. If the range BR of Mask are all + // 1s, BR is valid after the shifts. + if (!isRangeofAIAllOnes(BR, Mask)) + return false; + BR.Shift = NewShift; + + // Specify the proper BR we want to handle. + if (BR.Shift + BR.Width > BitSize) + return false; + if (BR.Width != 8 && BR.Width != 16 && BR.Width != 32 && BR.Width != 64) + return false; + if (BR.Shift % 8 != 0) + return false; + + // Match other Binary operators like Add/Sub/And/Xor/Or or pattern like + // And(Or(...And(Or(LargeVal, Cst), SmallVal))) and find the final load. + SmallVector Insts; + if (!matchInstsToShrink(Val, BR, Insts)) + return false; + + // Expect the final load has been found here. + assert(!Insts.empty() && "Expect Insts to be not empty"); + LoadInst *LI = dyn_cast(Insts.back()); + assert(LI && "Last elem in Insts should be a LoadInst"); + if (LI->isVolatile() || !LI->isUnordered()) + return false; + + IntegerType *NewTy = Type::getIntNTy(*Context, BR.Width); + if (TLI->isNarrowingExpensive(EVT::getEVT(Ty), EVT::getEVT(NewTy))) + return false; + + // Do legality check to ensure the range of shrunk load is not clobbered + // from *LI to IN. + IRBuilder<> Builder(*Context); + Builder.SetInsertPoint(&IN); + unsigned LoadByteOffset = computeByteOffset(BR, BitSize, *DL); + Value *NewPtr = + createNewPtr(LI->getPointerOperand(), LoadByteOffset, NewTy, Builder); + // There are clobbers and all the clobbers are in the same block as IN. + bool AllClobberInToBlock; + if (hasClobberBetween(*LI, IN, MemoryLocation(NewPtr, BR.Width / 8), + AllClobberInToBlock)) { + if (!AllClobberInToBlock) { + RecursivelyDeleteTriviallyDeadInstructions(NewPtr); + return false; + } + } + if (!isBeneficial(ResShift, Insts)) { + RecursivelyDeleteTriviallyDeadInstructions(NewPtr); + return false; + } + + // Do the shrinking transformation. + Value *NewVal = + shrinkInsts(NewPtr, BR, Insts, AllClobberInToBlock, ResShift, Builder); + DEBUG(dbgs() << "MemShrink: Replace" << IN << " with" << *NewVal << "\n"); + IN.replaceAllUsesWith(NewVal); + markCandForDeletion(IN); + NumLoadShrunk++; + return true; +} + +/// Check insts in CandidatesToErase. If they are dead, remove the dead +/// instructions recursively and clean up Memory SSA. +bool MemAccessShrinkingPass::removeDeadInsts() { + bool Changed = false; + + for (Instruction *I : CandidatesToErase) { + RecursivelyDeleteTriviallyDeadInstructions(I, TLInfo, MSSAUpdater.get()); + Changed = true; + } + CandidatesToErase.clear(); + return Changed; +} + +/// Try memory access shrinking on Function \Fn. 
+bool MemAccessShrinkingPass::tryShrink(Function &Fn) { + bool MadeChange = false; + for (BasicBlock *BB : post_order(&Fn)) { + for (auto InstI = BB->rbegin(), InstE = BB->rend(); InstI != InstE;) { + Instruction &Inst = *InstI++; + if (StoreInst *SI = dyn_cast(&Inst)) { + if (tryShrinkStoreBySplit(*SI)) { + MadeChange = true; + continue; + } + MadeChange |= tryShrinkStoreToLegalTy(*SI); + continue; + } + if (isa(&Inst) || isa(&Inst)) + MadeChange |= tryShrinkLoadUse(Inst); + } + } + return MadeChange; +} + +bool MemAccessShrinkingPass::runOnFunction(Function &Fn) { + if (skipFunction(Fn)) + return false; + + DL = &Fn.getParent()->getDataLayout(); + DT = &getAnalysis().getDomTree(); + TLInfo = &getAnalysis().getTLI(); + MSSA = &getAnalysis().getMSSA(); + MSSAWalker = MSSA->getWalker(); + MSSAUpdater = make_unique(MSSA); + CandidatesToErase.clear(); + + Context = &Fn.getContext(); + + if (TM) + TLI = TM->getSubtargetImpl(Fn)->getTargetLowering(); + + // Iterates the Fn until nothing is changed. + MSSA->verifyMemorySSA(); + bool MadeChange = false; + while (true) { + if (!tryShrink(Fn) && !removeDeadInsts()) + break; + MadeChange = true; + } + + return MadeChange; +} Index: lib/CodeGen/TargetPassConfig.cpp =================================================================== --- lib/CodeGen/TargetPassConfig.cpp +++ lib/CodeGen/TargetPassConfig.cpp @@ -68,6 +68,9 @@ cl::desc("Disable Machine LICM")); static cl::opt DisableMachineSink("disable-machine-sink", cl::Hidden, cl::desc("Disable Machine Sinking")); +static cl::opt + DisableMemAccessShrinking("disable-memaccess-shrinking", cl::Hidden, + cl::desc("Disable MemAccessShrinking")); static cl::opt DisableLSR("disable-lsr", cl::Hidden, cl::desc("Disable Loop Strength Reduction Pass")); static cl::opt DisableConstantHoisting("disable-constant-hoisting", @@ -482,6 +485,9 @@ if (getOptLevel() != CodeGenOpt::None && !DisableConstantHoisting) addPass(createConstantHoistingPass()); + if (getOptLevel() != CodeGenOpt::None && !DisableMemAccessShrinking) + addPass(createMemAccessShrinkingPass(TM)); + if (getOptLevel() != CodeGenOpt::None && !DisablePartialLibcallInlining) addPass(createPartiallyInlineLibCallsPass()); Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -141,6 +141,7 @@ bool isZExtFree(SDValue Val, EVT VT2) const override; bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + bool isNarrowingExpensive(EVT VT1, EVT VT2) const override; MVT getVectorIdxTy(const DataLayout &) const override; bool isSelectSupported(SelectSupportKind) const override; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -761,6 +761,21 @@ return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; } +bool AMDGPUTargetLowering::isNarrowingExpensive(EVT SrcVT, EVT DestVT) const { + // If we are reducing to a 32-bit load, this is always better. + if (DestVT.getStoreSizeInBits() == 32) + return true; + + // Don't produce extloads from sub 32-bit types. SI doesn't have scalar + // extloads, so doing one requires using a buffer_load. In cases where we + // still couldn't use a scalar load, using the wider load shouldn't really + // hurt anything. 
+ + // If the old size already had to be an extload, there's no harm in continuing + // to reduce the width. + return (SrcVT.getStoreSizeInBits() < 32); +} + //===---------------------------------------------------------------------===// // TargetLowering Callbacks //===---------------------------------------------------------------------===// Index: lib/Transforms/Utils/Local.cpp =================================================================== --- lib/Transforms/Utils/Local.cpp +++ lib/Transforms/Utils/Local.cpp @@ -364,9 +364,8 @@ /// trivially dead instruction, delete it. If that makes any of its operands /// trivially dead, delete them too, recursively. Return true if any /// instructions were deleted. -bool -llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V, - const TargetLibraryInfo *TLI) { +bool llvm::RecursivelyDeleteTriviallyDeadInstructions( + Value *V, const TargetLibraryInfo *TLI, MemorySSAUpdater *MSSAUpdater) { Instruction *I = dyn_cast(V); if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLI)) return false; @@ -393,6 +392,11 @@ DeadInsts.push_back(OpI); } + // If MemorySSA is used, update it when we are about to erase a memory + // access instruction. + if (MSSAUpdater) + MSSAUpdater->removeMemoryAccess(I); + I->eraseFromParent(); } while (!DeadInsts.empty()); Index: test/CodeGen/ARM/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -12,13 +12,9 @@ ; ; BE-LABEL: i24_or: ; BE: @ BB#0: -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: ldrb r2, [r0, #2] -; BE-NEXT: orr r1, r2, r1, lsl #8 +; BE-NEXT: ldrh r1, [r0, #1] ; BE-NEXT: orr r1, r1, #384 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] +; BE-NEXT: strh r1, [r0, #1] ; BE-NEXT: mov pc, lr %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 @@ -39,11 +35,12 @@ ; ; BE-LABEL: i24_and_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, #128 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: orr r1, r1, #1 -; BE-NEXT: strh r1, [r0] +; BE-NEXT: ldrh r1, [r0, #1] +; BE-NEXT: mov r2, #16256 +; BE-NEXT: orr r2, r2, #49152 +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: and r1, r1, r2 +; BE-NEXT: strh r1, [r0, #1] ; BE-NEXT: mov pc, lr %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 @@ -55,23 +52,18 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; LE-LABEL: i24_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldrh r2, [r0] -; LE-NEXT: mov r3, #255 -; LE-NEXT: orr r3, r3, #57088 -; LE-NEXT: and r2, r2, r3 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: strh r1, [r0] +; LE-NEXT: ldrb r2, [r0, #1] +; LE-NEXT: and r2, r2, #223 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: strb r1, [r0, #1] ; LE-NEXT: mov pc, lr ; ; BE-LABEL: i24_insert_bit: ; BE: @ BB#0: -; BE-NEXT: ldrh r2, [r0] -; BE-NEXT: mov r3, #57088 -; BE-NEXT: orr r3, r3, #16711680 -; BE-NEXT: and r2, r3, r2, lsl #8 -; BE-NEXT: orr r1, r2, r1, lsl #13 -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] +; BE-NEXT: ldrb r2, [r0, #1] +; BE-NEXT: and r2, r2, #223 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: strb r1, [r0, #1] ; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i24 %b = load i24, i24* %a, align 1 @@ -92,19 +84,9 @@ ; ; BE-LABEL: i56_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: ldrh r2, [r1, #4]! 
-; BE-NEXT: ldrb r3, [r1, #2] -; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strb r2, [r1, #2] -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %aa = load i56, i56* %a %b = or i56 %aa, 384 @@ -123,19 +105,10 @@ ; ; BE-LABEL: i56_and_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: mov r3, #128 -; BE-NEXT: ldrh r2, [r1, #4]! -; BE-NEXT: strb r3, [r1, #2] -; BE-NEXT: lsl r2, r2, #8 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: bic r1, r1, #127 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %b = load i56, i56* %a, align 1 @@ -148,30 +121,18 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { ; LE-LABEL: i56_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldr r2, [r0] -; LE-NEXT: bic r2, r2, #8192 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: str r1, [r0] +; LE-NEXT: ldr r2, [r0, #1] +; LE-NEXT: bic r2, r2, #32 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: str r1, [r0, #1] ; LE-NEXT: mov pc, lr ; ; BE-LABEL: i56_insert_bit: ; BE: @ BB#0: -; BE-NEXT: .save {r11, lr} -; BE-NEXT: push {r11, lr} -; BE-NEXT: mov r2, r0 -; BE-NEXT: ldr lr, [r0] -; BE-NEXT: ldrh r12, [r2, #4]! -; BE-NEXT: ldrb r3, [r2, #2] -; BE-NEXT: orr r12, r3, r12, lsl #8 -; BE-NEXT: orr r3, r12, lr, lsl #24 -; BE-NEXT: bic r3, r3, #8192 -; BE-NEXT: orr r1, r3, r1, lsl #13 -; BE-NEXT: lsr r3, r1, #8 -; BE-NEXT: strh r3, [r2] -; BE-NEXT: bic r2, lr, #255 -; BE-NEXT: orr r1, r2, r1, lsr #24 -; BE-NEXT: str r1, [r0] -; BE-NEXT: pop {r11, lr} +; BE-NEXT: ldr r2, [r0, #2] +; BE-NEXT: bic r2, r2, #32 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: str r1, [r0, #2] ; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 Index: test/CodeGen/ARM/load-shrink.ll =================================================================== --- test/CodeGen/ARM/load-shrink.ll +++ test/CodeGen/ARM/load-shrink.ll @@ -0,0 +1,111 @@ +; RUN: opt < %s -mtriple=arm-eabi -memaccessshrink -S | FileCheck %s +; Check bitfield load is shrinked properly in cases below. + +target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32" + +; The bitfield store can be shrinked from i64 to i16. +; CHECK-LABEL: @load_and( +; CHECK: %cast = bitcast i64* %ptr to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 8 +; CHECK: %trunc.zext = zext i16 %load.trunc to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext, 1 + +define i1 @load_and(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.clear = and i64 %bf.load, 65535 + %cmp = icmp eq i64 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. +; CHECK-LABEL: @load_trunc( +; CHECK: %cast = bitcast i64* %ptr to i8* +; CHECK: %load.trunc = load i8, i8* %cast, align 8 +; CHECK: %cmp = icmp eq i8 %load.trunc, 1 + +define i1 @load_trunc(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.clear = trunc i64 %bf.load to i8 + %cmp = icmp eq i8 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. 
+; CHECK-LABEL: @load_and_shr(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %trunc.zext = zext i8 %load.trunc to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.lshr = lshr i64 %bf.load, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_and_shr_add(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr_add(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %add = add i64 %bf.load, 281474976710656
+  %bf.lshr = lshr i64 %add, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_ops(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_ops(i64* %ptr, i64 %value) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.value = and i64 %value, 65535
+  %bf.clear1 = and i64 %bf.load, -65536
+  %bf.set = or i64 %bf.value, %bf.clear1
+  %add = add i64 %bf.set, 281474976710656
+  %bf.lshr = lshr i64 %add, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The shrink is not worth doing because %bf.load has multiple uses
+; and shrinking here does not save any instructions.
+; CHECK-LABEL: @load_trunc_multiuses
+; CHECK: %bf.load = load i64, i64* %ptr, align 8
+; CHECK: %bf.trunc = trunc i64 %bf.load to i16
+; CHECK: %cmp1 = icmp eq i16 %bf.trunc, 3
+; CHECK: %cmp2 = icmp ult i64 %bf.load, 1500000
+; CHECK: %cmp = and i1 %cmp1, %cmp2
+
+define i1 @load_trunc_multiuses(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.trunc = trunc i64 %bf.load to i16
+  %cmp1 = icmp eq i16 %bf.trunc, 3
+  %cmp2 = icmp ult i64 %bf.load, 1500000
+  %cmp = and i1 %cmp1, %cmp2
+  ret i1 %cmp
+}
+
Index: test/CodeGen/ARM/store-shrink.ll
===================================================================
--- test/CodeGen/ARM/store-shrink.ll
+++ test/CodeGen/ARM/store-shrink.ll
@@ -0,0 +1,366 @@
+; RUN: opt < %s -mtriple=arm-eabi -memaccessshrink -S | FileCheck %s
+; Check that bitfield stores are shrunk properly in the cases below.
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+
+; class A1 {
+; unsigned long f1:8;
+; unsigned long f2:3;
+; } a1;
+; a1.f1 = n;
+;
+; The bitfield store can be shrunk from i16 to i8.
+; CHECK-LABEL: @test1( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %t0 = trunc i64 %conv to i16 +; CHECK: %bf.value = and i16 %t0, 255 +; CHECK: %trunc = trunc i16 %bf.value to i8 +; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8 + +%class.A1 = type { i16, [6 x i8] } +@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8 + +define void @test1(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %t0 = trunc i64 %conv to i16 + %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + %bf.value = and i16 %t0, 255 + %bf.clear = and i16 %bf.load, -256 + %bf.set = or i16 %bf.clear, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + ret void +} + +; class A2 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a2; +; a2.f1 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test2( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8 + +%class.A2 = type { i24, [4 x i8] } +@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8 + +define void @test2(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.clear = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear, %bf.value + store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8 + ret void +} + +; class A3 { +; unsigned long f1:32; +; unsigned long f2:3; +; } a3; +; a3.f1 = n; +; The bitfield store can be shrinked from i64 to i32. +; CHECK-LABEL: @test3( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %bf.value = and i64 %conv, 4294967295 +; CHECK: %trunc = trunc i64 %bf.value to i32 +; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8 + +%class.A3 = type { i40 } +@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8 + +define void @test3(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8 + %bf.value = and i64 %conv, 4294967295 + %bf.clear = and i64 %bf.load, -4294967296 + %bf.set = or i64 %bf.clear, %bf.value + store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8 + ret void +} + +; class A4 { +; unsigned long f1:13; +; unsigned long f2:3; +; } a4; +; a4.f1 = n; +; The bitfield store cannot be shrinked because the field is not 8/16/32 bits. 
+; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. +; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. 
+; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. +; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. 
+; CHECK-LABEL: @i24_insert_bit( +; CHECK: %extbit = zext i1 %bit to i24 +; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13 +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i24 %extbit.shl, 8 +; CHECK: %trunc = trunc i24 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i24 + %b = load i24, i24* %a, align 1 + %extbit.shl = shl nuw nsw i24 %extbit, 13 + %c = and i24 %b, -8193 + %d = or i24 %c, %extbit.shl + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or( +; CHECK: %cast = bitcast i56* %a to i32* +; CHECK: %load.trunc = load i32, i32* %cast, align 1 +; CHECK: %or.trunc = or i32 %load.trunc, 384 +; CHECK: store i32 %or.trunc, i32* %cast, align 1 +; +define void @i56_or(i56* %a) { + %aa = load i56, i56* %a, align 1 + %b = or i56 %aa, 384 + store i56 %b, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_and_or( +; CHECK: %cast = bitcast i56* %a to i32* +; CHECK: %load.trunc = load i32, i32* %cast, align 1 +; CHECK: %and.trunc = and i32 %load.trunc, -128 +; CHECK: %or.trunc = or i32 %and.trunc, 384 +; CHECK: store i32 %or.trunc, i32* %cast, align 1 +; +define void @i56_and_or(i56* %a) { + %b = load i56, i56* %a, align 1 + %c = and i56 %b, -128 + %d = or i56 %c, 384 + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store to i8. +; CHECK-LABEL: @i56_insert_bit( +; CHECK: %extbit = zext i1 %bit to i56 +; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13 +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %load.trunc = load i32, i32* %cast1, align 1 +; CHECK: %lshr = lshr i56 %extbit.shl, 8 +; CHECK: %trunc = trunc i56 %lshr to i32 +; CHECK: %and.trunc = and i32 %load.trunc, -33 +; CHECK: %or.trunc = or i32 %and.trunc, %trunc +; CHECK: store i32 %or.trunc, i32* %cast1, align 1 +; +define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i56 + %b = load i56, i56* %a, align 1 + %extbit.shl = shl nuw nsw i56 %extbit, 13 + %c = and i56 %b, -8193 + %d = or i56 %c, %extbit.shl + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. 
+; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %load.trunc = load i32, i32* %cast1, align 2 +; CHECK: %or.trunc = or i32 %load.trunc, 272 +; CHECK: store i32 %or.trunc, i32* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: test/CodeGen/X86/2008-09-11-CoalescerBug2.ll =================================================================== --- test/CodeGen/X86/2008-09-11-CoalescerBug2.ll +++ test/CodeGen/X86/2008-09-11-CoalescerBug2.ll @@ -13,9 +13,9 @@ ; SOURCE-SCHED: xorl ; SOURCE-SCHED: cmpl ; SOURCE-SCHED: setg -; SOURCE-SCHED: movb ; SOURCE-SCHED: xorl ; SOURCE-SCHED: subl +; SOURCE-SCHED: movb ; SOURCE-SCHED: testb ; SOURCE-SCHED: jne %0 = load i32, i32* @g_5, align 4 ; [#uses=1] Index: test/CodeGen/X86/constant-combines.ll =================================================================== --- test/CodeGen/X86/constant-combines.ll +++ test/CodeGen/X86/constant-combines.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | FileCheck %s +; RUN: llc -disable-memaccess-shrinking < %s | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" Index: test/CodeGen/X86/i16lshr8pat.ll =================================================================== --- test/CodeGen/X86/i16lshr8pat.ll +++ test/CodeGen/X86/i16lshr8pat.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=x86 -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s +; RUN: llc -march=x86 -disable-memaccess-shrinking -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386-unknown-linux-gnu" Index: test/CodeGen/X86/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -4,13 +4,7 @@ define void @i24_or(i24* %a) { ; CHECK-LABEL: i24_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180 ; CHECK-NEXT: retq %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 @@ -22,13 +16,9 @@ ; CHECK-LABEL: i24_and_or: ; CHECK: # BB#0: ; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 -; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: orl $384, %eax # imm = 0x180 +; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80 +; CHECK-NEXT: movw %ax, (%rdi) ; CHECK-NEXT: retq %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 @@ -40,16 +30,11 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; CHECK-LABEL: i24_insert_bit: ; CHECK: # BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: movzwl (%rdi), %ecx -; CHECK-NEXT: movzbl 2(%rdi), %edx -; CHECK-NEXT: movb %dl, 2(%rdi) -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: shll $13, %eax -; CHECK-NEXT: andl $16769023, %edx # imm = 0xFFDFFF -; CHECK-NEXT: orl %eax, %edx -; CHECK-NEXT: movw %dx, (%rdi) +; 
CHECK-NEXT: movb 1(%rdi), %al +; CHECK-NEXT: shlb $5, %sil +; CHECK-NEXT: andb $-33, %al +; CHECK-NEXT: orb %sil, %al +; CHECK-NEXT: movb %al, 1(%rdi) ; CHECK-NEXT: retq %extbit = zext i1 %bit to i24 %b = load i24, i24* %a, align 1 @@ -63,19 +48,7 @@ define void @i56_or(i56* %a) { ; CHECK-LABEL: i56_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) -; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: orq $384, %rdx # imm = 0x180 -; CHECK-NEXT: movl %edx, (%rdi) -; CHECK-NEXT: shrq $32, %rdx -; CHECK-NEXT: movw %dx, 4(%rdi) +; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180 ; CHECK-NEXT: retq %aa = load i56, i56* %a, align 1 %b = or i56 %aa, 384 @@ -86,21 +59,10 @@ define void @i56_and_or(i56* %a) { ; CHECK-LABEL: i56_and_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) -; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: orq $384, %rdx # imm = 0x180 -; CHECK-NEXT: movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80 -; CHECK-NEXT: andq %rdx, %rax -; CHECK-NEXT: movl %eax, (%rdi) -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: movw %ax, 4(%rdi) +; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: orl $384, %eax # imm = 0x180 +; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80 +; CHECK-NEXT: movw %ax, (%rdi) ; CHECK-NEXT: retq %b = load i56, i56* %a, align 1 %c = and i56 %b, -128 @@ -112,23 +74,11 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { ; CHECK-LABEL: i56_insert_bit: ; CHECK: # BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: movzwl 4(%rdi), %ecx -; CHECK-NEXT: movzbl 6(%rdi), %edx -; CHECK-NEXT: movl (%rdi), %esi -; CHECK-NEXT: movb %dl, 6(%rdi) -; CHECK-NEXT: # kill: %EDX %EDX %RDX %RDX -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: shlq $32, %rdx -; CHECK-NEXT: orq %rdx, %rsi -; CHECK-NEXT: shlq $13, %rax -; CHECK-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: movl %ecx, (%rdi) -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: movw %cx, 4(%rdi) +; CHECK-NEXT: movb 1(%rdi), %al +; CHECK-NEXT: shlb $5, %sil +; CHECK-NEXT: andb $-33, %al +; CHECK-NEXT: orb %sil, %al +; CHECK-NEXT: movb %al, 1(%rdi) ; CHECK-NEXT: retq %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 Index: test/CodeGen/X86/load-shrink.ll =================================================================== --- test/CodeGen/X86/load-shrink.ll +++ test/CodeGen/X86/load-shrink.ll @@ -0,0 +1,111 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s +; Check bitfield load is shrinked properly in cases below. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; The bitfield store can be shrinked from i64 to i16. 
+; CHECK-LABEL: @load_and(
+; CHECK: %cast = bitcast i64* %ptr to i16*
+; CHECK: %load.trunc = load i16, i16* %cast, align 8
+; CHECK: %trunc.zext = zext i16 %load.trunc to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.clear = and i64 %bf.load, 65535
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_trunc(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %load.trunc = load i8, i8* %cast, align 8
+; CHECK: %cmp = icmp eq i8 %load.trunc, 1
+
+define i1 @load_trunc(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.clear = trunc i64 %bf.load to i8
+  %cmp = icmp eq i8 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_and_shr(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %trunc.zext = zext i8 %load.trunc to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.lshr = lshr i64 %bf.load, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_and_shr_add(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr_add(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %add = add i64 %bf.load, 281474976710656
+  %bf.lshr = lshr i64 %add, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_ops(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_ops(i64* %ptr, i64 %value) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.value = and i64 %value, 65535
+  %bf.clear1 = and i64 %bf.load, -65536
+  %bf.set = or i64 %bf.value, %bf.clear1
+  %add = add i64 %bf.set, 281474976710656
+  %bf.lshr = lshr i64 %add, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The shrink is not worth doing because %bf.load has multiple uses
+; and shrinking here does not save any instructions.
+; CHECK-LABEL: @load_trunc_multiuses
+; CHECK: %bf.load = load i64, i64* %ptr, align 8
+; CHECK: %bf.trunc = trunc i64 %bf.load to i16
+; CHECK: %cmp1 = icmp eq i16 %bf.trunc, 3
+; CHECK: %cmp2 = icmp ult i64 %bf.load, 1500000
+; CHECK: %cmp = and i1 %cmp1, %cmp2
+
+define i1 @load_trunc_multiuses(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.trunc = trunc i64 %bf.load to i16
+  %cmp1 = icmp eq i16 %bf.trunc, 3
+  %cmp2 = icmp ult i64 %bf.load, 1500000
+  %cmp = and i1 %cmp1, %cmp2
+  ret i1 %cmp
+}
+
Index: test/CodeGen/X86/load-slice.ll
===================================================================
--- test/CodeGen/X86/load-slice.ll
+++ test/CodeGen/X86/load-slice.ll
@@ -110,20 +110,23 @@
   ret i32 %res
 }
 
-; Check that we do not optimize overlapping slices.
 ;
-; The 64-bits should NOT have been split in as slices are overlapping.
+; The slices are overlapping, but the load can still be split.
 ; First slice uses bytes numbered 0 to 3.
 ; Second slice uses bytes numbered 6 and 7.
 ; Third slice uses bytes numbered 4 to 7.
 ;
 ; STRESS-LABEL: t3:
-; STRESS: shrq $48
-; STRESS: shrq $32
+; STRESS: movzwl 6(%rdi,%rsi,8), %eax
+; STRESS-NEXT: addl (%rdi,%rsi,8), %eax
+; STRESS-NEXT: addl 4(%rdi,%rsi,8), %eax
 ;
 ; REGULAR-LABEL: t3:
-; REGULAR: shrq $48
-; REGULAR: shrq $32
+; REGULAR: movq (%rdi,%rsi,8), %rcx
+; REGULAR-NEXT: movq %rcx, %rax
+; REGULAR-NEXT: shrq $48, %rax
+; REGULAR-NEXT: addl %ecx, %eax
+; REGULAR-NEXT: addl 4(%rdi,%rsi,8), %eax
 define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
   %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
   %bitcast = bitcast %class.Complex* %arrayidx to i64*
Index: test/CodeGen/X86/lsr-loop-exit-cond.ll
===================================================================
--- test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -2,14 +2,16 @@
 ; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK-LABEL: t:
+; CHECK: [[LABEL:LBB.*]]:
 ; CHECK: movl (%r9,%rax,4), %e{{..}}
-; CHECK-NEXT: testq
-; CHECK-NEXT: jne
+; CHECK: testq
+; CHECK: jne [[LABEL]]
 
 ; ATOM-LABEL: t:
+; ATOM: [[LABEL:LBB.*]]:
 ; ATOM: movl (%r9,%r{{.+}},4), %e{{..}}
-; ATOM-NEXT: testq
-; ATOM-NEXT: jne
+; ATOM: testq
+; ATOM: jne [[LABEL]]
 
 @Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5]
 @Te1 = external global [256 x i32] ; <[256 x i32]*> [#uses=4]
Index: test/CodeGen/X86/mem-access-shrink.ll
===================================================================
--- test/CodeGen/X86/mem-access-shrink.ll
+++ test/CodeGen/X86/mem-access-shrink.ll
@@ -0,0 +1,216 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s
+; Check that the memory accesses in the test are shrunk as expected.
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%class.B = type { i56, [4 x i16], i64 } +@i = local_unnamed_addr global i64 0, align 8 + +define zeroext i1 @foo(%class.B* nocapture %this, i8* %p1, i64 %p2) local_unnamed_addr align 2 { +; CHECK-LABEL: @foo( +entry: +; CHECK: entry: +; CHECK: %t0 = bitcast %class.B* %this to i64* +; CHECK: %[[CAST1:.*]] = bitcast i64* %t0 to i16* +; CHECK: %[[LOAD1:.*]] = load i16, i16* %[[CAST1]], align 8 +; CHECK: %[[TRUNC1:.*]] = zext i16 %[[LOAD1]] to i64 +; CHECK: %cmp = icmp eq i64 %[[TRUNC1]], 1 + + %t0 = bitcast %class.B* %this to i64* + %bf.load = load i64, i64* %t0, align 8 + %bf.clear = and i64 %bf.load, 65535 + %cmp = icmp eq i64 %bf.clear, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry +; CHECK: if.end: +; CHECK: %[[CAST2:.*]] = bitcast i64* %t0 to i16* +; CHECK: %[[LTRUNC1:.*]] = load i16, i16* %[[CAST2]], align 8 +; CHECK: %[[ADD1:.*]] = add i16 %[[LTRUNC1]], -1 +; CHECK: %[[TRUNCZ1:.*]] = zext i16 %[[ADD1]] to i64 +; CHECK: %[[CAST3:.*]] = bitcast i64* %t0 to i16* +; CHECK: %[[TRUNC2:.*]] = trunc i64 %[[TRUNCZ1]] to i16 +; CHECK: store i16 %[[TRUNC2]], i16* %[[CAST3]], align 8 + + %dec = add i64 %bf.load, 65535 + %bf.value = and i64 %dec, 65535 + %bf.clear5 = and i64 %bf.load, -65536 + %bf.set = or i64 %bf.value, %bf.clear5 + store i64 %bf.set, i64* %t0, align 8 + %t1 = ptrtoint i8* %p1 to i64 + %t2 = load i64, i64* @i, align 8 + %cmp.i = icmp ult i64 %t2, 3 + br i1 %cmp.i, label %if.then.i, label %if.else.i + +if.then.i: ; preds = %if.end + %and.i = lshr i64 %t1, 3 + %div.i = and i64 %and.i, 1023 + br label %_ZNK1B5m_fn3EPv.exit + +if.else.i: ; preds = %if.end + %first_page_.i = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 2 + %t3 = load i64, i64* %first_page_.i, align 8 + %mul.i = shl i64 %t3, 13 + %sub.i = sub i64 %t1, %mul.i + %div2.i = lshr i64 %sub.i, 2 + br label %_ZNK1B5m_fn3EPv.exit + +_ZNK1B5m_fn3EPv.exit: ; preds = %if.then.i, %if.else.i +; CHECK: _ZNK1B5m_fn3EPv.exit: +; CHECK: %[[CAST4:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP1:.*]] = getelementptr i8, i8* %[[CAST4]], i32 6 +; CHECK: %[[LTRUNC2:.*]] = load i8, i8* %[[UGEP1]], align 2 +; CHECK: %[[TRUNCZ2:.*]] = zext i8 %[[LTRUNC2]] to i64 +; CHECK: %cmp8 = icmp eq i64 %[[TRUNCZ2]], 4 + + %j.0.i = phi i64 [ %div.i, %if.then.i ], [ %div2.i, %if.else.i ] + %conv.i = trunc i64 %j.0.i to i16 + %bf.lshr = lshr i64 %bf.load, 48 + %bf.clear7 = and i64 %bf.lshr, 255 + %cmp8 = icmp eq i64 %bf.clear7, 4 + br i1 %cmp8, label %if.else, label %if.then9 + +if.then9: ; preds = %_ZNK1B5m_fn3EPv.exit +; CHECK: if.then9: +; CHECK: %arrayidx = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 1, i64 %[[TRUNCZ2]] +; CHECK: store i16 %conv.i, i16* %arrayidx, align 2 +; CHECK: %[[CAST5:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP2:.*]] = getelementptr i8, i8* %[[CAST5]], i32 6 +; CHECK: %[[CAST6:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP3:.*]] = getelementptr i8, i8* %[[CAST6]], i32 6 +; CHECK: %[[LTRUNC3:.*]] = load i8, i8* %[[UGEP3]], align 2 +; CHECK: %[[ADD2:.*]] = add i8 %[[LTRUNC3]], 1 +; CHECK: %[[AND1:.*]] = and i8 %[[ADD2]], -1 +; CHECK: store i8 %[[AND1]], i8* %[[UGEP2]], align 2 + + %arrayidx = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 1, i64 %bf.clear7 + store i16 %conv.i, i16* %arrayidx, align 2 + %inc79 = add i64 %bf.set, 281474976710656 + %bf.shl = and i64 %inc79, 71776119061217280 + %bf.clear18 = and i64 %bf.set, -71776119061217281 + %bf.set19 = or i64 %bf.shl, %bf.clear18 + store 
i64 %bf.set19, i64* %t0, align 8 + br label %return + +if.else: ; preds = %_ZNK1B5m_fn3EPv.exit +; CHECK: if.else: +; CHECK: %[[CAST7:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP4:.*]] = getelementptr i8, i8* %[[CAST7]], i32 4 +; CHECK: %[[CAST8:.*]] = bitcast i8* %[[UGEP4]] to i16* +; CHECK: %[[LTRUNC4:.*]] = load i16, i16* %[[CAST8]], align 4 +; CHECK: %[[TRUNCZ3:.*]] = zext i16 %[[LTRUNC4]] to i64 +; CHECK: %cmp24 = icmp eq i64 %[[TRUNCZ3]], 1 + + %bf.lshr21 = lshr i64 %bf.load, 32 + %bf.clear22 = and i64 %bf.lshr21, 65535 + %cmp24 = icmp eq i64 %bf.clear22, 1 + br i1 %cmp24, label %if.else55, label %land.lhs.true + +land.lhs.true: ; preds = %if.else +; CHECK: land.lhs.true: +; CHECK: %[[CAST9:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP5:.*]] = getelementptr i8, i8* %[[CAST9]], i32 2 +; CHECK: %[[CAST10:.*]] = bitcast i8* %[[UGEP5]] to i16* +; CHECK: %[[LTRUNC5:.*]] = load i16, i16* %[[CAST10]], align 2 +; CHECK: %[[TRUNCZ4:.*]] = zext i16 %[[LTRUNC5]] to i64 +; CHECK: %cmp28 = icmp eq i64 %[[TRUNCZ4]], %sub + + %bf.lshr26 = lshr i64 %bf.load, 16 + %bf.clear27 = and i64 %bf.lshr26, 65535 + %div = lshr i64 %p2, 1 + %sub = add nsw i64 %div, -1 + %cmp28 = icmp eq i64 %bf.clear27, %sub + br i1 %cmp28, label %if.else55, label %if.then29 + +if.then29: ; preds = %land.lhs.true + %cmp30 = icmp ult i64 %p2, 3 + br i1 %cmp30, label %if.then31, label %if.else35 + +if.then31: ; preds = %if.then29 +; CHECK: if.then31: +; CHECK: %mul = shl nuw nsw i64 %[[TRUNCZ3]], 3 + + %and = and i64 %t1, -8192 + %mul = shl nuw nsw i64 %bf.clear22, 3 + %add = add i64 %mul, %and + br label %if.end41 + +if.else35: ; preds = %if.then29 +; CHECK: if.else35: +; CHECK: %[[CAST11:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP6:.*]] = getelementptr i8, i8* %[[CAST11]], i32 4 +; CHECK: %[[CAST12:.*]] = bitcast i8* %[[UGEP6]] to i16* +; CHECK: %[[LTRUNC6:.*]] = load i16, i16* %[[CAST12]], align 4 +; CHECK: %[[TRUNCZ5:.*]] = zext i16 %[[LTRUNC6]] to i64 +; CHECK: %shl = shl i64 %[[TRUNCZ5]], 6 + + %first_page_.i80 = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 2 + %t4 = load i64, i64* %first_page_.i80, align 8 + %mul.i81 = shl i64 %t4, 13 + %conv.i82 = shl nuw nsw i64 %bf.lshr21, 6 + %mul3.i = and i64 %conv.i82, 4194240 + %add.i = add i64 %mul.i81, %mul3.i + br label %if.end41 + +if.end41: ; preds = %if.else35, %if.then31 +; CHECK: if.end41: +; CHECK: %[[CAST17:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP9:.*]] = getelementptr i8, i8* %[[CAST17]], i32 2 +; CHECK: %[[CAST18:.*]] = bitcast i8* %[[UGEP9]] to i16* +; CHECK: %[[LTRUNC8:.*]] = load i16, i16* %[[CAST18]], align 2 +; CHECK: %[[ADD4:.*]] = add i16 %[[LTRUNC8]], 1 +; CHECK: %[[TRUNC3:.*]] = zext i16 %[[ADD4]] to i64 +; CHECK: %[[CAST13:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP7:.*]] = getelementptr i8, i8* %[[CAST13]], i32 2 +; CHECK: %[[CAST14:.*]] = bitcast i8* %[[UGEP7]] to i16* +; CHECK: %[[CAST15:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP8:.*]] = getelementptr i8, i8* %[[CAST15]], i32 2 +; CHECK: %[[CAST16:.*]] = bitcast i8* %[[UGEP8]] to i16* +; CHECK: %[[LTRUNC7:.*]] = load i16, i16* %[[CAST16]], align 2 +; CHECK: %[[ADD3:.*]] = add i16 %[[LTRUNC7]], 1 +; CHECK: %[[AND2:.*]] = and i16 %[[ADD3]], -1 +; CHECK: store i16 %[[AND2]], i16* %[[CAST14]], align 2 +; CHECK: %arrayidx54 = getelementptr inbounds i16, i16* %t5, i64 %[[TRUNC3]] + + %add.i.sink = phi i64 [ %add.i, %if.else35 ], [ %add, %if.then31 ] + %t5 = inttoptr i64 %add.i.sink to i16* + %inc4578 = add i64 %bf.set, 65536 + %bf.shl48 = and i64 %inc4578, 4294901760 
+ %bf.clear49 = and i64 %bf.set, -4294901761 + %bf.set50 = or i64 %bf.shl48, %bf.clear49 + store i64 %bf.set50, i64* %t0, align 8 + %bf.lshr52 = lshr i64 %inc4578, 16 + %bf.clear53 = and i64 %bf.lshr52, 65535 + %arrayidx54 = getelementptr inbounds i16, i16* %t5, i64 %bf.clear53 + store i16 %conv.i, i16* %arrayidx54, align 2 + br label %return + +if.else55: ; preds = %land.lhs.true, %if.else +; CHECK: if.else55: +; CHECK: %[[CAST19:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP10:.*]] = getelementptr i8, i8* %[[CAST19]], i32 4 +; CHECK: %[[CAST20:.*]] = bitcast i8* %[[UGEP10]] to i16* +; CHECK: %[[LTRUNC9:.*]] = load i16, i16* %[[CAST20]], align 4 +; CHECK: store i16 %[[LTRUNC9]], i16* %t6, align 2 +; CHECK: %[[UGEP11:.*]] = getelementptr i8, i8* %cast, i32 2 +; CHECK: %[[CAST21:.*]] = bitcast i8* %[[UGEP11]] to i32* +; CHECK: %lshr = lshr i64 %bf.shl63, 16 +; CHECK: %[[TRUNC4:.*]] = trunc i64 %lshr to i32 +; CHECK: store i32 %[[TRUNC4]], i32* %[[CAST21]], align 2 + + %conv59 = trunc i64 %bf.lshr21 to i16 + %t6 = bitcast i8* %p1 to i16* + store i16 %conv59, i16* %t6, align 2 + %bf.load61 = load i64, i64* %t0, align 8 + %conv60 = shl i64 %j.0.i, 32 + %bf.shl63 = and i64 %conv60, 281470681743360 + %bf.clear64 = and i64 %bf.load61, -281474976645121 + %bf.clear67 = or i64 %bf.clear64, %bf.shl63 + store i64 %bf.clear67, i64* %t0, align 8 + br label %return + +return: ; preds = %if.then9, %if.else55, %if.end41, %entry + %retval.0 = phi i1 [ false, %entry ], [ true, %if.end41 ], [ true, %if.else55 ], [ true, %if.then9 ] + ret i1 %retval.0 +} + Index: test/CodeGen/X86/store-shrink.ll =================================================================== --- test/CodeGen/X86/store-shrink.ll +++ test/CodeGen/X86/store-shrink.ll @@ -0,0 +1,365 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s +; Check bitfield store is shrinked properly in cases below. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; class A1 { +; unsigned long f1:8; +; unsigned long f2:3; +; } a1; +; a1.f1 = n; +; +; The bitfield store can be shrinked from i16 to i8. +; CHECK-LABEL: @test1( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %t0 = trunc i64 %conv to i16 +; CHECK: %bf.value = and i16 %t0, 255 +; CHECK: %trunc = trunc i16 %bf.value to i8 +; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8 + +%class.A1 = type { i16, [6 x i8] } +@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8 + +define void @test1(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %t0 = trunc i64 %conv to i16 + %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + %bf.value = and i16 %t0, 255 + %bf.clear = and i16 %bf.load, -256 + %bf.set = or i16 %bf.clear, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + ret void +} + +; class A2 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a2; +; a2.f1 = n; +; The bitfield store can be shrinked from i32 to i16. 
+; CHECK-LABEL: @test2( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8 + +%class.A2 = type { i24, [4 x i8] } +@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8 + +define void @test2(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.clear = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear, %bf.value + store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8 + ret void +} + +; class A3 { +; unsigned long f1:32; +; unsigned long f2:3; +; } a3; +; a3.f1 = n; +; The bitfield store can be shrinked from i64 to i32. +; CHECK-LABEL: @test3( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %bf.value = and i64 %conv, 4294967295 +; CHECK: %trunc = trunc i64 %bf.value to i32 +; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8 + +%class.A3 = type { i40 } +@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8 + +define void @test3(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8 + %bf.value = and i64 %conv, 4294967295 + %bf.clear = and i64 %bf.load, -4294967296 + %bf.set = or i64 %bf.clear, %bf.value + store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8 + ret void +} + +; class A4 { +; unsigned long f1:13; +; unsigned long f2:3; +; } a4; +; a4.f1 = n; +; The bitfield store cannot be shrinked because the field is not 8/16/32 bits. +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. 
+; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. +; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. 
+; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. +; CHECK-LABEL: @i24_insert_bit( +; CHECK: %extbit = zext i1 %bit to i24 +; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13 +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i24 %extbit.shl, 8 +; CHECK: %trunc = trunc i24 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i24 + %b = load i24, i24* %a, align 1 + %extbit.shl = shl nuw nsw i24 %extbit, 13 + %c = and i24 %b, -8193 + %d = or i24 %c, %extbit.shl + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_or(i56* %a) { + %aa = load i56, i56* %a, align 1 + %b = or i56 %aa, 384 + store i56 %b, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_and_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_and_or(i56* %a) { + %b = load i56, i56* %a, align 1 + %c = and i56 %b, -128 + %d = or i56 %c, 384 + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store to i8. 
+; CHECK-LABEL: @i56_insert_bit( +; CHECK: %extbit = zext i1 %bit to i56 +; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13 +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i56 %extbit.shl, 8 +; CHECK: %trunc = trunc i56 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i56 + %b = load i56, i56* %a, align 1 + %extbit.shl = shl nuw nsw i56 %extbit, 13 + %c = and i56 %b, -8193 + %d = or i56 %c, %extbit.shl + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i16* +; CHECK: %load.trunc = load i16, i16* %cast1, align 2 +; CHECK: %or.trunc = or i16 %load.trunc, 272 +; CHECK: store i16 %or.trunc, i16* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -385,6 +385,7 @@ initializeTarget(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. + initializeMemAccessShrinkingPassPass(Registry); initializeCodeGenPreparePass(Registry); initializeAtomicExpandPass(Registry); initializeRewriteSymbolsLegacyPassPass(Registry);