Index: include/llvm/CodeGen/Passes.h
===================================================================
--- include/llvm/CodeGen/Passes.h
+++ include/llvm/CodeGen/Passes.h
@@ -64,6 +64,9 @@
   MachineFunctionPass *createResetMachineFunctionPass(bool EmitFallbackDiag, bool AbortOnFailedISel);

+  /// This pass shrinks some loads/stores to narrower accesses of legal type.
+  FunctionPass *createMemAccessShrinkingPass(const TargetMachine *TM = nullptr);
+
   /// createCodeGenPreparePass - Transform the code to expose more pattern
   /// matching during instruction selection.
   FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = nullptr);
Index: include/llvm/IR/PatternMatch.h
===================================================================
--- include/llvm/IR/PatternMatch.h
+++ include/llvm/IR/PatternMatch.h
@@ -317,6 +317,9 @@
 /// \brief Match a ConstantInt, capturing the value if we match.
 inline bind_ty<ConstantInt> m_ConstantInt(ConstantInt *&CI) { return CI; }

+/// \brief Match a load instruction, capturing the value if we match.
+inline bind_ty<LoadInst> m_Load(LoadInst *&LI) { return LI; }
+
 /// \brief Match a Constant, capturing the value if we match.
 inline bind_ty<Constant> m_Constant(Constant *&C) { return C; }
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -243,6 +243,7 @@
 void initializeMachineSinkingPass(PassRegistry&);
 void initializeMachineTraceMetricsPass(PassRegistry&);
 void initializeMachineVerifierPassPass(PassRegistry&);
+void initializeMemAccessShrinkingPassPass(PassRegistry&);
 void initializeMemCpyOptLegacyPassPass(PassRegistry&);
 void initializeMemDepPrinterPass(PassRegistry&);
 void initializeMemDerefPrinterPass(PassRegistry&);
Index: include/llvm/LinkAllPasses.h
===================================================================
--- include/llvm/LinkAllPasses.h
+++ include/llvm/LinkAllPasses.h
@@ -166,6 +166,7 @@
       (void) llvm::createCountingFunctionInserterPass();
       (void) llvm::createEarlyCSEPass();
       (void) llvm::createGVNHoistPass();
+      (void) llvm::createMemAccessShrinkingPass();
       (void) llvm::createMergedLoadStoreMotionPass();
       (void) llvm::createGVNPass();
       (void) llvm::createNewGVNPass();
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1900,6 +1900,14 @@
     return false;
   }

+  /// Return true if it is expensive to narrow operations of type VT1 to
+  /// VT2. Sometimes we want to try load/store shrinking to save extra
+  /// bit manipulation operations, and we need to make sure the shrinking
+  /// is not so expensive that it offsets the benefit.
+  virtual bool isNarrowingExpensive(EVT /*VT1*/, EVT /*VT2*/) const {
+    return true;
+  }
+
   /// \brief Return true if it is beneficial to convert a load of a constant to
   /// just the constant itself.
  /// On some targets it might be more efficient to use a combination of
Index: lib/CodeGen/CMakeLists.txt
===================================================================
--- lib/CodeGen/CMakeLists.txt
+++ lib/CodeGen/CMakeLists.txt
@@ -84,6 +84,7 @@
   MachineSSAUpdater.cpp
   MachineTraceMetrics.cpp
   MachineVerifier.cpp
+  MemAccessShrinking.cpp
   PatchableFunction.cpp
   MIRPrinter.cpp
   MIRPrintingPass.cpp
Index: lib/CodeGen/CodeGen.cpp
===================================================================
--- lib/CodeGen/CodeGen.cpp
+++ lib/CodeGen/CodeGen.cpp
@@ -48,6 +48,7 @@
   initializeMachineBlockPlacementPass(Registry);
   initializeMachineBlockPlacementStatsPass(Registry);
   initializeMachineCSEPass(Registry);
+  initializeMemAccessShrinkingPassPass(Registry);
   initializeImplicitNullChecksPass(Registry);
   initializeMachineCombinerPass(Registry);
   initializeMachineCopyPropagationPass(Registry);
Index: lib/CodeGen/MemAccessShrinking.cpp
===================================================================
--- lib/CodeGen/MemAccessShrinking.cpp
+++ lib/CodeGen/MemAccessShrinking.cpp
@@ -0,0 +1,1065 @@
+//===- MemAccessShrinking.cpp - Shrink the type of mem accesses -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// The pass tries to find opportunities where we can save bit manipulation
+/// operations or convert illegal type operations to legal type by shrinking
+/// the type of memory accesses.
+///
+/// A major motivation of the optimization:
+/// Consecutive bitfields are now wrapped as a group and represented as a
+/// large integer. To access a specific bitfield, a wide load/store plus a
+/// series of bit operations are needed. However, if the bitfield is of legal
+/// type, it is more efficient to access it directly. Another case is when the
+/// bitfield group is of illegal integer type: if we only access one bitfield
+/// in the group, we may have a chance to shrink the illegal type to a smaller
+/// legal type and save some bit mask operations.
+/// Although the major motivation of the optimization is bitfield accesses,
+/// it is not limited to them and can be applied to general memory accesses
+/// with similar patterns.
+///
+/// Below are some typical patterns and the way they are transformed, to give
+/// an idea of how the optimization works. More general patterns that are
+/// recognized can be found in the function header comments.
+///
+/// P1: store(or(and(load P, Cst), MaskedVal), P)
+///     If the bit operations show that only the range (width+shift, shift]
+///     of MaskedVal is extracted and filled into the target memory, it will
+///     be transformed to:
+///       narrow_store(trunc(lshr(MaskedVal, shift) to width), P+shift)
+///     Note: It is often the case that the pattern "trunc(lshr(MaskedVal, ...)"
+///     can be further simplified by the DAG combiner, which increases the
+///     benefit.
+///
+/// P2: and(lshr(load P, Cst1), Cst2)
+///     If the bit operations show that only the range (width+shift, shift]
+///     of load P holds the bits actually used, it will be transformed to:
+///       narrow_load(P+shift), with the load type shrunk to a 'width'-bit
+///     integer.
+///
+/// Some notes about the algorithm:
+/// * The algorithm scans the program and tries to recognize and transform the
+///   patterns above. It runs iteratively until no more changes happen.
+/// * To prevent the optimization from blocking load/store coalescing, it is
+///   invoked late in the pipeline, just before CodeGenPrepare. In this late
+///   stage, both the pattern matching and the related safety checks become
+///   more difficult because earlier optimizations have introduced merged
+///   loads/stores and more complex control flow. That is why MemorySSA is
+///   used here: it is scalable and precise for the safety checks, especially
+///   when we try to insert a shrunken load in a block that is many blocks
+///   away from its original load.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
+#include "llvm/Transforms/Utils/MemorySSAUpdater.h"
+
+#define DEBUG_TYPE "memaccessshrink"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+STATISTIC(NumStoreShrinked, "Number of stores shrunk");
+STATISTIC(NumLoadShrinked, "Number of loads shrunk");
+
+static cl::opt<bool> EnableLoadShrink("enable-load-shrink", cl::init(true));
+static cl::opt<bool> EnableStoreShrink("enable-store-shrink", cl::init(true));
+
+namespace {
+/// Describes the range of bits that a memory access actually uses or
+/// modifies.
+struct ModRange {
+  unsigned Shift;
+  unsigned Width;
+  bool ShrinkWithMaskedVal;
+};
+
+/// \brief The memory access shrinking pass.
+class MemAccessShrinkingPass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  MemAccessShrinkingPass(const TargetMachine *TM = nullptr) : FunctionPass(ID) {
+    this->TM = TM;
+    initializeMemAccessShrinkingPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &Fn) override;
+
+  StringRef getPassName() const override { return "MemAccess Shrinking"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<MemorySSAWrapperPass>();
+    AU.addPreserved<MemorySSAWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+  }
+
+private:
+  const DataLayout *DL = nullptr;
+  const DominatorTree *DT = nullptr;
+  const TargetMachine *TM = nullptr;
+  const TargetLowering *TLI = nullptr;
+  const TargetLibraryInfo *TLInfo = nullptr;
+  LLVMContext *Context;
+
+  MemorySSA *MSSA;
+  MemorySSAWalker *MSSAWalker;
+  std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
+
+  // Candidate instructions to be erased if they have no other uses.
+  SmallPtrSet<Instruction *, 8> CandidatesToErase;
+
+  // MultiUsesSeen indicates that a node with multiple uses was found on the
+  // def-use chain.
+  bool MultiUsesSeen;
+  // SavedInsts records how many instructions the current shrink has saved so
+  // far. It is used to evaluate whether the shrinking still saves
+  // instructions, especially when MultiUsesSeen is true.
+ unsigned SavedInsts; + + void analyzeOrAndPattern(Value &MaskedVal, ConstantInt &Cst, ModRange &MR, + unsigned TBits); + void analyzeBOpPattern(Value &Val, ConstantInt &Cst, ModRange &MR, + unsigned TBits); + bool updateModRange(ModRange &MR, unsigned TBits, unsigned Align); + bool hasClobberBetween(Instruction &From, Instruction &To); + bool needNewStore(Value &OrigVal, StoreInst &SI); + Value *createNewPtr(Value *Ptr, unsigned StOffset, Type *NewTy, + IRBuilder<> &Builder); + + bool reduceLoadOpsStoreWidth(StoreInst &SI); + bool tryShrinkOnInst(Instruction &Inst); + bool reduceLoadOpsWidth(Instruction &IN); + Value *shrinkInsts(Value *NewPtr, ModRange &MR, + SmallVectorImpl &Insts, + Instruction *NewInsertPt, unsigned ResShift, + IRBuilder<> &Builder); + bool matchInstsToShrink(Value *Val, ModRange &MR, + SmallVectorImpl &Insts); + bool hasClobberBetween(Instruction &From, Instruction &To, + const MemoryLocation &Mem, Instruction *&NewInsertPt); + void addSavedInst(Instruction &Inst, bool ReplaceAllUses = false); + bool isBeneficial(unsigned ResShift, SmallVectorImpl &Insts); + void markCandForDeletion(Instruction &I) { CandidatesToErase.insert(&I); } + bool removeDeadInsts(); +}; +} // namespace + +char MemAccessShrinkingPass::ID = 0; +INITIALIZE_TM_PASS_BEGIN(MemAccessShrinkingPass, "memaccessshrink", + "MemAccess Shrinking", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_TM_PASS_END(MemAccessShrinkingPass, "memaccessshrink", + "MemAccess Shrinking", false, false) + +FunctionPass *llvm::createMemAccessShrinkingPass(const TargetMachine *TM) { + return new MemAccessShrinkingPass(TM); +} + +/// Analyze or ((and (load P), \p Cst), \p MaskedVal). Update \p MR.Width +/// with the number of bits of the original load to be modified, and update +/// \p MR.Shift with the pos of the first bit to be modified. If the +/// analysis result shows we can use bits extracted from MaskedVal as store +/// value, set \p MR.ShrinkWithMaskedVal to be true. +void MemAccessShrinkingPass::analyzeOrAndPattern(Value &MaskedVal, + ConstantInt &Cst, ModRange &MR, + unsigned TBits) { + // Cst is the mask. Analyze the pattern of mask after sext it to uint64_t. We + // will handle patterns like either 0..01..1 or 1..10..01..1 + APInt Mask = Cst.getValue().sextOrTrunc(TBits); + assert(Mask.getBitWidth() == TBits && "Unexpected bitwidth of Mask"); + unsigned MaskLeadOnes = Mask.countLeadingOnes(); + if (MaskLeadOnes == TBits) + return; + unsigned MaskTrailOnes = Mask.countTrailingOnes(); + unsigned MaskMidZeros = !MaskLeadOnes + ? Mask.countLeadingZeros() + : Mask.ashr(MaskTrailOnes).countTrailingZeros(); + + MR.ShrinkWithMaskedVal = true; + // See if we have a continuous run of zeros. + if (MaskLeadOnes + MaskMidZeros + MaskTrailOnes != TBits) { + MaskMidZeros = TBits - MaskLeadOnes - MaskTrailOnes; + MR.ShrinkWithMaskedVal = false; + } + + // Check MaskedVal only provides nonzero bits within range from lowbits + // (MaskTrailOnes) to highbits (MaskTrailOnes + MaskMidZeros). + APInt BitMask = + ~APInt::getBitsSet(TBits, MaskTrailOnes, MaskTrailOnes + MaskMidZeros); + + // Find out the range in which 1 appears in MaskedVal. 
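As an aside, the mask decomposition performed by analyzeOrAndPattern above can be illustrated with a small standalone C++ sketch. It is not part of the patch: it uses a fixed 64-bit width and plain integers instead of APInt, and all helper names are invented for illustration.

#include <cassert>
#include <cstdint>

// Decompose a 64-bit mask of the form 1..10..01..1 (or 0..01..1): the middle
// run of zeros is the range that the or+and pair replaces with bits taken
// from MaskedVal. Shift is the index of the first replaced bit and Width the
// number of replaced bits.
struct BitRange {
  unsigned Shift, Width;
  bool Contiguous;
};

static unsigned countTrailingOnes(uint64_t V) {
  unsigned N = 0;
  while (V & 1) {
    V >>= 1;
    ++N;
  }
  return N;
}

static unsigned countLeadingOnes(uint64_t V) {
  unsigned N = 0;
  for (int I = 63; I >= 0 && ((V >> I) & 1); --I)
    ++N;
  return N;
}

static BitRange decomposeMask(uint64_t Mask) {
  if (Mask == ~0ULL) // Nothing is replaced.
    return {0, 0, true};
  unsigned TrailOnes = countTrailingOnes(Mask);
  unsigned LeadOnes = countLeadingOnes(Mask);
  unsigned MidZeros = 64 - LeadOnes - TrailOnes;
  // The decomposition is only usable if the zeros form one contiguous run.
  uint64_t ZeroRun =
      (MidZeros == 64) ? ~0ULL : (((1ULL << MidZeros) - 1) << TrailOnes);
  return {TrailOnes, MidZeros, ~Mask == ZeroRun};
}

int main() {
  // or(and(load, 0xFFFFFFFFFFFF00FF), MaskedVal): only bits [8, 16) of the
  // loaded value are replaced, so the store can be shrunk to one byte at
  // byte offset 1 on a little-endian target.
  BitRange BR = decomposeMask(0xFFFFFFFFFFFF00FFULL);
  assert(BR.Shift == 8 && BR.Width == 8 && BR.Contiguous);
  return 0;
}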
+  APInt KnownOne(TBits, 0), KnownZero(TBits, 0);
+  computeKnownBits(&MaskedVal, KnownZero, KnownOne, *DL, 0);
+
+  MR.Shift = MaskTrailOnes;
+  MR.Width = MaskMidZeros;
+  // We expect the bits that are 1 in BitMask to all be known-zero bits of
+  // MaskedVal; otherwise we need to set ShrinkWithMaskedVal to false and
+  // expand MR.
+  if ((KnownZero & BitMask) != BitMask) {
+    MR.ShrinkWithMaskedVal = false;
+    // Lower is the first bit that is not known to be zero.
+    unsigned Lower = KnownZero.countTrailingOnes();
+    // Higher is one past the last bit that is not known to be zero.
+    unsigned Higher = TBits - KnownZero.countLeadingOnes();
+    MR.Shift = std::min(Lower, MaskTrailOnes);
+    MR.Width = std::max(Higher, MaskTrailOnes + MaskMidZeros) - MR.Shift;
+  }
+}
+
+/// Analyze \p Val = or/xor/and(load P, \p Cst). Update \p MR.Width with the
+/// number of bits of the original load to be modified, and update \p MR.Shift
+/// with the position of the first bit to be modified.
+void MemAccessShrinkingPass::analyzeBOpPattern(Value &Val, ConstantInt &Cst,
+                                               ModRange &MR, unsigned TBits) {
+  APInt Mask = Cst.getValue().sextOrTrunc(TBits);
+  BinaryOperator *BOP = cast<BinaryOperator>(&Val);
+  if (BOP->getOpcode() == Instruction::And)
+    Mask = ~Mask;
+
+  MR.Shift = Mask.countTrailingZeros();
+  MR.Width = Mask.getBitWidth() - MR.Shift;
+  if (MR.Width)
+    MR.Width = MR.Width - Mask.countLeadingZeros();
+  MR.ShrinkWithMaskedVal = false;
+}
+
+/// Update \p MR.Width and \p MR.Shift so that the updated \p MR.Width bits
+/// can form a legal type and also cover all the modified bits.
+bool MemAccessShrinkingPass::updateModRange(ModRange &MR, unsigned TBits,
+                                            unsigned Align) {
+  ModRange NewMR;
+  NewMR.Width = PowerOf2Ceil(MR.Width);
+  Type *NewTy = Type::getIntNTy(*Context, NewMR.Width);
+
+  // Check if we can find a new Shift for the Width of NewMR, so that
+  // NewMR forms a new range covering the old modified range without
+  // worsening alignment.
+  auto coverOldRange = [&](ModRange &NewMR, ModRange &MR) -> bool {
+    unsigned MAlign = MinAlign(Align, DL->getABITypeAlignment(NewTy));
+    int Shift = MR.Shift - MR.Shift % (MAlign * 8);
+    while (Shift >= 0) {
+      if (NewMR.Width + Shift <= TBits &&
+          NewMR.Width + Shift >= MR.Width + MR.Shift) {
+        NewMR.Shift = Shift;
+        return true;
+      }
+      Shift -= MAlign * 8;
+    }
+    return false;
+  };
+  // See whether we can store NewTy legally.
+  auto isStoreLegalType = [&](Type *NewTy) -> bool {
+    EVT OldEVT =
+        TLI->getValueType(*DL, Type::getIntNTy(*Context, PowerOf2Ceil(TBits)));
+    EVT NewEVT = TLI->getValueType(*DL, NewTy);
+    return TLI->isOperationLegalOrCustom(ISD::STORE, NewEVT) ||
+           TLI->isTruncStoreLegalOrCustom(OldEVT, NewEVT);
+  };
+  // Try to find the minimal NewMR.Width which can form a legal type and cover
+  // all the old modified bits.
+  while (NewMR.Width < TBits &&
+         (!isStoreLegalType(NewTy) || !coverOldRange(NewMR, MR))) {
+    NewMR.Width = NextPowerOf2(NewMR.Width);
+    NewTy = Type::getIntNTy(*Context, NewMR.Width);
+  }
+  MR.Width = NewMR.Width;
+  MR.Shift = NewMR.Shift;
+
+  if (MR.Width >= TBits)
+    return false;
+  return true;
+}
+
+/// Compute the byte offset used to build the new pointer address. It is
+/// mainly based on MR and the endianness of the target.
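Before the definition of computeStOffset below, here is a minimal standalone C++ sketch of the byte-offset rule it implements. It is not part of the patch, the function name is invented, and it only illustrates the little-/big-endian cases for a byte-aligned range.

#include <cassert>

// Byte offset, within the in-memory representation of a TBits-wide integer,
// of the bit range [Shift, Shift + Width). Mirrors the endianness handling
// of computeStOffset for ranges that are byte aligned.
static unsigned fieldByteOffset(unsigned TBits, unsigned Shift, unsigned Width,
                                bool BigEndian) {
  unsigned BitOffset = BigEndian ? TBits - Shift - Width : Shift;
  assert(BitOffset % 8 == 0 && "only byte-aligned ranges are addressable");
  return BitOffset / 8;
}

int main() {
  // An 8-bit range starting at bit 48 of an i64 lives in byte 6 on a
  // little-endian target and in byte 1 on a big-endian target.
  assert(fieldByteOffset(64, 48, 8, /*BigEndian=*/false) == 6);
  assert(fieldByteOffset(64, 48, 8, /*BigEndian=*/true) == 1);
  return 0;
}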
+static unsigned computeStOffset(ModRange &MR, unsigned TBits,
+                                const DataLayout &DL) {
+  unsigned StOffset;
+  if (DL.isBigEndian())
+    StOffset = TBits - MR.Shift - MR.Width;
+  else
+    StOffset = MR.Shift;
+
+  if (StOffset % 8 != 0)
+    MR.ShrinkWithMaskedVal = false;
+
+  return StOffset / 8;
+}
+
+/// Check whether V1 and V2 have the same pointer value by looking through
+/// bitcasts.
+static bool hasSamePtr(Value *V1, Value *V2) {
+  Value *NV1 = nullptr;
+  Value *NV2 = nullptr;
+  if (V1 == V2)
+    return true;
+  if (BitCastInst *BC1 = dyn_cast<BitCastInst>(V1))
+    NV1 = BC1->getOperand(0);
+  if (BitCastInst *BC2 = dyn_cast<BitCastInst>(V2))
+    NV2 = BC2->getOperand(0);
+  if (!NV1 && !NV2)
+    return false;
+  if (V1 == NV2 || V2 == NV1 || NV1 == NV2)
+    return true;
+  return false;
+}
+
+/// Check whether there is any clobbering instruction between \p From and
+/// \p To for the memory location accessed by \p To.
+bool MemAccessShrinkingPass::hasClobberBetween(Instruction &From,
+                                               Instruction &To) {
+  assert(MSSA->getMemoryAccess(&To) && "Expect To has valid MemoryAccess");
+  MemoryAccess *FromAccess = MSSA->getMemoryAccess(&From);
+  assert(FromAccess && "Expect From has valid MemoryAccess");
+  MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(&To);
+  if (FromAccess != DefiningAccess &&
+      MSSA->dominates(FromAccess, DefiningAccess))
+    return true;
+  return false;
+}
+
+/// It is possible that a store of OrigVal already exists before \p SI and is
+/// not clobbered; in that case we can reuse it and do not have to generate a
+/// new store.
+bool MemAccessShrinkingPass::needNewStore(Value &OrigVal, StoreInst &SI) {
+  for (User *U : OrigVal.users()) {
+    StoreInst *PrevSI = dyn_cast<StoreInst>(U);
+    if (!PrevSI ||
+        !hasSamePtr(PrevSI->getPointerOperand(), SI.getPointerOperand()))
+      continue;
+    if (!hasClobberBetween(*PrevSI, SI))
+      return false;
+  }
+  return true;
+}
+
+/// Create the new address to be used by the shrunken load/store, based on the
+/// original address \p Ptr, the byte offset \p StOffset and the new type
+/// \p NewTy.
+Value *MemAccessShrinkingPass::createNewPtr(Value *Ptr, unsigned StOffset,
+                                            Type *NewTy, IRBuilder<> &Builder) {
+  Value *NewPtr = Ptr;
+  unsigned AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
+  if (StOffset) {
+    ConstantInt *Idx = ConstantInt::get(Type::getInt32Ty(*Context), StOffset);
+    NewPtr =
+        Builder.CreateBitCast(Ptr, Type::getInt8PtrTy(*Context, AS), "cast");
+    NewPtr =
+        Builder.CreateGEP(Type::getInt8Ty(*Context), NewPtr, Idx, "uglygep");
+  }
+  return Builder.CreateBitCast(NewPtr, NewTy->getPointerTo(AS), "cast");
+}
+
+/// Try to shrink the store when the input \p SI has one of the patterns:
+///   Pattern1: or(and(Val, Cst), MaskedVal).
+///   Pattern2: or/and/xor(load, Cst).
+/// For the first pattern, when Cst and MaskedVal satisfy some requirements,
+/// the or+and pair has the property that only a portion of Val is modified
+/// and the rest of it is not changed. We want to shrink the store to be
+/// aligned to the modified range of Val.
+/// Pattern1 after the shrinking looks like:
+///
+///   store(Val)   // The store can be omitted if Val is a load with the same
+///                // address as the original store.
+///   narrow_store(shrunken MaskedVal)
+///
+/// However, if some condition is not satisfied, which is indicated by
+/// MR::ShrinkWithMaskedVal being false, we may try another kind of shrinking:
+/// shrinking the store to a legal type if it is of illegal type.
So for +/// Pattern2 and Pattern1 when Val is load and MR::ShrinkWithMaskedVal is +/// false, if the store is of illegal type, we may shrink them to: +/// +/// store(or(and(shrinked load, shrinked Cst), shrinked MaskedVal)), or +/// store(or/and/xor(shrinked load, shrinked Cst)) +/// +/// After the shrinking, all the operations will be of legal type. +bool MemAccessShrinkingPass::reduceLoadOpsStoreWidth(StoreInst &SI) { + Value *Val = SI.getOperand(0); + Value *Ptr = SI.getOperand(1); + Type *StoreTy = Val->getType(); + if (StoreTy->isVectorTy() || !StoreTy->isIntegerTy()) + return false; + + if (SI.isVolatile() || !SI.isUnordered()) + return false; + + unsigned TBits = DL->getTypeSizeInBits(StoreTy); + if (TBits != DL->getTypeStoreSizeInBits(StoreTy)) + return false; + + LoadInst *LI = nullptr; + Value *OrigVal = nullptr; + Value *MaskedVal; + ConstantInt *Cst; + // Match or(and(Val, Cst), MaskedVal) pattern or their correspondents + // after commutation. + // However the pattern matching below is more complex than that because + // of the commutation and some matching preference we expect. + // for case: or(and(or(and(LI, Cst_1), MaskedVal_1), Cst_2), MaskedVal_2) + // and if MaskedVal_2 happens to be another and operator, we hope we can + // match or(and(LI, Cst_1), MaskedVal_1) to Val instead of MaskedVal. + bool OrAndPattern = + match(Val, m_c_Or(m_And(m_Load(LI), m_ConstantInt(Cst)), + m_Value(MaskedVal))) || + (match(Val, m_Or(m_And(m_Value(OrigVal), m_ConstantInt(Cst)), + m_Value(MaskedVal))) && + match(OrigVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))) || + (match(Val, m_Or(m_Value(MaskedVal), + m_And(m_Value(OrigVal), m_ConstantInt(Cst)))) && + match(OrigVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))); + // Match "or/and/xor(load, cst)" pattern or their correspondents after + // commutation. + bool BopPattern = match(Val, m_c_Or(m_Load(LI), m_ConstantInt(Cst))) || + match(Val, m_c_And(m_Load(LI), m_ConstantInt(Cst))) || + match(Val, m_c_Xor(m_Load(LI), m_ConstantInt(Cst))); + if (!OrAndPattern && !BopPattern) + return false; + + // LI should have the same address as SI. + if (LI && !hasSamePtr(LI->getPointerOperand(), Ptr)) + return false; + + if (LI && (LI->isVolatile() || !LI->isUnordered())) + return false; + + // Make sure the memory location of LI/SI is not clobbered between them. + if (LI && hasClobberBetween(*LI, SI)) + return false; + + // Analyze MR which indicates the range of the input that will actually + // be modified and stored. + ModRange MR = {0, 0, true}; + if (OrAndPattern) + analyzeOrAndPattern(*MaskedVal, *Cst, MR, TBits); + if (BopPattern) + analyzeBOpPattern(*Val, *Cst, MR, TBits); + + assert(MR.Shift + MR.Width <= TBits && "Unexpected ModRange!"); + if (!MR.Width) + return false; + + unsigned StOffset; + if (MR.ShrinkWithMaskedVal) { + // Get the offset from Ptr for the shrinked store. + StOffset = computeStOffset(MR, TBits, *DL); + + // If MR.Width is not the length of legal type, we cannot + // store MaskedVal directly. + if (MR.Width != 8 && MR.Width != 16 && MR.Width != 32) + MR.ShrinkWithMaskedVal = false; + } + + unsigned Align = SI.getAlignment(); + // If we are shrink illegal type of store with original val, update MR.Width + // and MR.Shift to ensure the shrinked store is of legal type. + if (!MR.ShrinkWithMaskedVal) { + // Check whether the store is of illegal type. If it is not, don't bother. 
+ if (!TLI || TLI->isOperationLegalOrCustom(ISD::STORE, + TLI->getValueType(*DL, StoreTy))) + return false; + // Try to find out a MR of legal type. + if (!updateModRange(MR, TBits, Align)) + return false; + // It is meaningless to shrink illegal type store for and(or(Val, ...) + // pattern if Val is not a load, because we still have to insert another + // illegal type store for Val. + if (OrAndPattern && !LI) + return false; + + StOffset = computeStOffset(MR, TBits, *DL); + } + IntegerType *NewTy = Type::getIntNTy(*Context, MR.Width); + if (TLI->isNarrowingExpensive(EVT::getEVT(StoreTy), EVT::getEVT(NewTy))) + return false; + + // Start shrinking the size of the store. + IRBuilder<> Builder(*Context); + Builder.SetInsertPoint(&SI); + Value *NewPtr = createNewPtr(Ptr, StOffset, NewTy, Builder); + + if (StOffset) + Align = MinAlign(StOffset, Align); + + LoadInst *NewLI; + if (MR.ShrinkWithMaskedVal) { + // Check if we need to split the original store and generate a new + // store for the OrigVal. + if (OrigVal && needNewStore(*OrigVal, SI)) { + StoreInst *NewSI = cast(SI.clone()); + NewSI->setOperand(0, OrigVal); + NewSI->setOperand(1, Ptr); + Builder.Insert(NewSI); + DEBUG(dbgs() << "MemShrink: Insert" << *NewSI << " before" << SI << "\n"); + // MemorySSA update for the new store. + MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MemoryDef *NewMemAcc = cast( + MSSAUpdater->createMemoryAccessBefore(NewSI, Def, OldMemAcc)); + MSSAUpdater->insertDef(NewMemAcc, false); + } + } else { + // MemorySSA update for the shrinked load. + NewLI = cast(LI->clone()); + NewLI->mutateType(NewTy); + NewLI->setOperand(0, NewPtr); + NewLI->setAlignment(Align); + Builder.Insert(NewLI, "load.trunc"); + MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MSSAUpdater->createMemoryAccessBefore(NewLI, Def, OldMemAcc); + } + + // Create the New Value to store. + Value *NewVal = nullptr; + APInt ModifiedCst = Cst->getValue().lshr(MR.Shift).trunc(MR.Width); + ConstantInt *NewCst = ConstantInt::get(*Context, ModifiedCst); + if (OrAndPattern) { + // Shift and truncate MaskedVal. + Value *Trunc; + if (auto MVCst = dyn_cast(MaskedVal)) { + ModifiedCst = MVCst->getValue().lshr(MR.Shift).trunc(MR.Width); + Trunc = ConstantInt::get(*Context, ModifiedCst); + } else { + Value *ShiftedVal = MR.Shift + ? Builder.CreateLShr(MaskedVal, MR.Shift, "lshr") + : MaskedVal; + Trunc = Builder.CreateTruncOrBitCast(ShiftedVal, NewTy, "trunc"); + } + if (MR.ShrinkWithMaskedVal) { + NewVal = Trunc; + } else { + Value *NewAnd = Builder.CreateAnd(NewLI, NewCst, "and.trunc"); + NewVal = Builder.CreateOr(NewAnd, Trunc, "or.trunc"); + } + } else { + BinaryOperator *BOP = cast(Val); + switch (BOP->getOpcode()) { + default: + llvm_unreachable("BOP can only be And/Or/Xor"); + case Instruction::And: + NewVal = Builder.CreateAnd(NewLI, NewCst, "and.trunc"); + break; + case Instruction::Or: + NewVal = Builder.CreateOr(NewLI, NewCst, "or.trunc"); + break; + case Instruction::Xor: + NewVal = Builder.CreateXor(NewLI, NewCst, "xor.trunc"); + break; + } + } + + // Create the new store and remove the old one. + StoreInst *NewSI = cast(SI.clone()); + NewSI->setOperand(0, NewVal); + NewSI->setOperand(1, NewPtr); + NewSI->setAlignment(Align); + Builder.Insert(NewSI); + + DEBUG(dbgs() << "MemShrink: Replace" << SI << " with" << *NewSI << "\n"); + // MemorySSA update for the shrinked store. 
+ MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MemoryAccess *NewMemAcc = + MSSAUpdater->createMemoryAccessBefore(NewSI, Def, OldMemAcc); + OldMemAcc->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(OldMemAcc); + + markCandForDeletion(*cast(Val)); + SI.eraseFromParent(); + NumStoreShrinked++; + return true; +} + +/// The driver of shrinking and final dead instructions cleanup. +bool MemAccessShrinkingPass::tryShrinkOnInst(Instruction &Inst) { + StoreInst *SI = dyn_cast(&Inst); + if (EnableStoreShrink && SI) + return reduceLoadOpsStoreWidth(*SI); + if (EnableLoadShrink && (isa(&Inst) || isa(&Inst))) + return reduceLoadOpsWidth(Inst); + return false; +} + +/// Check the range of Cst containing non-zero bit is within \p MR when +/// \p AInB is true. If \p AInB is false, check MR is within the range +/// of Cst. +static bool CompareAPIntWithModRange(APInt &AI, ModRange &MR, bool AInB) { + unsigned LZ = AI.countLeadingZeros(); + unsigned TZ = AI.countTrailingZeros(); + unsigned TBits = AI.getBitWidth(); + if (AInB && (TZ < MR.Shift || (TBits - LZ) > MR.Width + MR.Shift)) + return false; + + if (!AInB && (MR.Shift < TZ || MR.Width + MR.Shift > (TBits - LZ))) + return false; + return true; +} + +/// Check if there is overlap between range \MR and \OtherMR. +static bool nonoverlap(ModRange &MR, ModRange &OtherMR) { + return (MR.Shift > OtherMR.Shift + OtherMR.Width - 1) || + (OtherMR.Shift > MR.Shift + MR.Width - 1); +} + +/// Check there is no instruction between *LI and IN which may clobber +/// the MemoryLocation specified by MR. We relax it a little bit to allow +/// clobber instruction in the same BB as IN, if that happens we need to +/// set the NewInsertPt to be the new location to insert the shrinked load. +bool MemAccessShrinkingPass::hasClobberBetween(Instruction &From, + Instruction &To, + const MemoryLocation &Mem, + Instruction *&NewInsertPt) { + assert(DT->dominates(&From, &To) && "From doesn't dominate To"); + const BasicBlock *FBB = From.getParent(); + const BasicBlock *TBB = To.getParent(); + // Collect BasicBlocks to scan between FBB and TBB into BBSet. + // Limit the maximum number of BasicBlocks to 3 to protect compile time. + SmallPtrSet BBSet; + SmallVector Worklist; + BBSet.insert(FBB); + BBSet.insert(TBB); + Worklist.push_back(TBB); + do { + const BasicBlock *BB = Worklist.pop_back_val(); + for (const BasicBlock *Pred : predecessors(BB)) { + if (!BBSet.count(Pred)) { + BBSet.insert(Pred); + Worklist.push_back(Pred); + } + } + } while (!Worklist.empty()); + + for (const BasicBlock *BB : BBSet) { + const MemorySSA::DefsList *DList = MSSA->getBlockDefs(BB); + if (!DList) + continue; + + for (const MemoryAccess &MA : *DList) { + if (const MemoryDef *MD = dyn_cast(&MA)) { + Instruction *Inst = MD->getMemoryInst(); + if ((FBB == Inst->getParent() && DT->dominates(Inst, &From)) || + (TBB == Inst->getParent() && DT->dominates(&To, Inst))) + continue; + if (MSSAWalker->getClobberingMemoryAccess(const_cast(MD), + Mem) == MD) { + if (TBB == Inst->getParent()) { + if (!NewInsertPt || DT->dominates(Inst, NewInsertPt)) + NewInsertPt = Inst; + continue; + } + return true; + } + } + } + } + return false; +} + +/// Match \p Val to the pattern: +/// Bop(...Bop(V, Cst_1), Cst_2, ..., Cst_N) and pattern: +/// or(and(...or(and(LI, Cst_1), MaskedVal_1), ..., Cse_N), MaskedVal_N), +/// and find the final load to be shrinked. 
+/// All the Bop instructions in the first pattern and the final load will +/// be added into \p Insts and will be shrinked afterwards. +bool MemAccessShrinkingPass::matchInstsToShrink( + Value *Val, ModRange &MR, SmallVectorImpl &Insts) { + Value *Opd; + ConstantInt *Cst; + unsigned TBits = DL->getTypeSizeInBits(Val->getType()); + + // Match the pattern: + // Bop(...Bop(V, Cst_1), Cst_2, ..., Cst_N), Bop can be Add/Sub/And/Or/Xor. + // All these Csts cannot have one bits outside of range MR, so that we can + // shrink these Bops to be MR.Width bits integer type. + while (match(Val, m_Add(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Sub(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_And(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Or(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Xor(m_Value(Opd), m_ConstantInt(Cst)))) { + addSavedInst(*cast(Val)); + APInt AI = Cst->getValue().zextOrTrunc(TBits); + if (!CompareAPIntWithModRange(AI, MR, true)) + return false; + Insts.push_back(cast(Val)); + Val = Opd; + } + + LoadInst *LI; + if ((LI = dyn_cast(Val))) { + addSavedInst(*LI); + Insts.push_back(LI); + return true; + } + + // Match the pattern: + // or(and(...or(and(LI, Cst_1), MaskedVal_1), ..., Cse_N), MaskedVal_N) + // Analyze the ModRange of each or(and(...)) pair and see if it has + // any overlap with MR. If any overlap is found, we cannot do shrinking + // for the final Load LI. + Value *MaskedVal; + Value *OrigVal; + ModRange OtherMR; + while (match(Val, m_c_Or(m_And(m_Load(LI), m_ConstantInt(Cst)), + m_Value(MaskedVal))) || + (match(Val, m_Or(m_And(m_Value(OrigVal), m_ConstantInt(Cst)), + m_Value(MaskedVal))) && + match(OrigVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))) || + (match(Val, m_Or(m_Value(MaskedVal), + m_And(m_Value(OrigVal), m_ConstantInt(Cst)))) && + match(OrigVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value())))) { + addSavedInst(*cast(Val)); + OtherMR.Shift = 0; + OtherMR.Width = TBits; + // Analyze ModRange OtherMR of A from or(and(A, Cst), MaskedVal) pattern. + analyzeOrAndPattern(*MaskedVal, *Cst, OtherMR, TBits); + // If OtherMR has overlap with MR, we cannot shrink load to its MR portion. + if (!nonoverlap(MR, OtherMR)) + return false; + if (LI) { + Insts.push_back(LI); + return true; + } + Val = OrigVal; + } + + return false; +} + +/// Find the next MemoryAccess after LI in the same BB. +MemoryAccess *findNextMemoryAccess(LoadInst &LI, MemorySSA &MSSA) { + for (auto Scan = LI.getIterator(); Scan != LI.getParent()->end(); ++Scan) { + if (MemoryAccess *MA = MSSA.getMemoryAccess(&*Scan)) + return MA; + } + return nullptr; +} + +/// Shrink \p Insts according to the range MR. +Value *MemAccessShrinkingPass::shrinkInsts( + Value *NewPtr, ModRange &MR, SmallVectorImpl &Insts, + Instruction *NewInsertPt, unsigned ResShift, IRBuilder<> &Builder) { + Value *NewVal; + IntegerType *NewTy = Type::getIntNTy(*Context, MR.Width); + Instruction &InsertPt = *Builder.GetInsertPoint(); + for (Instruction *I : make_range(Insts.rbegin(), Insts.rend())) { + if (LoadInst *LI = dyn_cast(I)) { + unsigned TBits = DL->getTypeSizeInBits(LI->getType()); + unsigned StOffset = computeStOffset(MR, TBits, *DL); + unsigned Align = MinAlign(StOffset, LI->getAlignment()); + // If we use a new insertion point, we have to recreate the NewPtr in + // the new place. 
+ if (NewInsertPt) { + Builder.SetInsertPoint(NewInsertPt); + RecursivelyDeleteTriviallyDeadInstructions(NewPtr); + NewPtr = + createNewPtr(LI->getPointerOperand(), StOffset, NewTy, Builder); + } + LoadInst *NewLI = cast(LI->clone()); + NewLI->setOperand(0, NewPtr); + NewLI->mutateType(NewTy); + NewLI->setAlignment(Align); + Builder.Insert(NewLI, "load.trunc"); + NewVal = NewLI; + // Update MemorySSA. + MemoryUseOrDef *OldMemAcc = + cast(MSSA->getMemoryAccess(LI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + // Find a proper position to insert the new MemoryAccess. + MemoryAccess *Next = findNextMemoryAccess(*NewLI, *MSSA); + if (Next) + MSSAUpdater->createMemoryAccessBefore(NewLI, Def, + cast(Next)); + else + MSSAUpdater->createMemoryAccessInBB(NewLI, Def, NewLI->getParent(), + MemorySSA::End); + } else { + // shrink Add/Sub/And/Xor/Or in Insts. + BinaryOperator *BOP = cast(I); + ConstantInt *Cst = cast(BOP->getOperand(1)); + APInt NewAPInt = Cst->getValue().extractBits(MR.Width, MR.Shift); + ConstantInt *NewCst = + ConstantInt::get(*Context, NewAPInt.zextOrTrunc(MR.Width)); + NewVal = Builder.CreateBinOp(BOP->getOpcode(), NewVal, NewCst); + } + } + // Adjust the type to be consistent with the use of input trunc/and + // instructions. + IntegerType *UseType = cast(InsertPt.getType()); + if (DL->getTypeSizeInBits(UseType) != MR.Width) + NewVal = Builder.CreateZExt(NewVal, UseType, "trunc.zext"); + // Adjust the NewVal with ResShift. + if (ResShift) { + ConstantInt *NewCst = ConstantInt::get(UseType, ResShift); + NewVal = Builder.CreateShl(NewVal, NewCst, "shl"); + } + return NewVal; +} + +/// Determine whether \p Inst will be saved in the final shrinking and update +/// MultiUsesSeen and SavedInsts accordingly. If \p ReplaceAllUses is +/// true, it means we are going to replace all the uses of \p Inst with the +/// shrinked result, so it doesn't matter even if it has multiple uses. +void MemAccessShrinkingPass::addSavedInst(Instruction &Inst, + bool ReplaceAllUses) { + if (!MultiUsesSeen) { + // If the Inst has multiple uses and the current shrinking cannot replace + // all its uses, it will not regarded as a SavedInst. + if (ReplaceAllUses || Inst.hasOneUse()) + SavedInsts += (!isa(Inst) && !isa(Inst)); + else + MultiUsesSeen = true; + } +} + +/// Compare \p SavedInsts with instructions we are about to create and decide +/// whether it is beneficial to do the shrinking. +bool MemAccessShrinkingPass::isBeneficial( + unsigned ResShift, SmallVectorImpl &Insts) { + unsigned InstsToCreate = 0; + if (ResShift) + InstsToCreate++; + InstsToCreate += Insts.size(); + return SavedInsts >= InstsToCreate; +} + +/// When the input instruction \p IN is and(Val, Cst) or trunc, it indicates +/// only a portion of its input value has been used. We will walk through the +/// ud chain, try to find out a final load and find out the range of the load +/// really being used. That is the range we want to shrink the load to. +/// The general pattern we are looking for is: +/// and/trunc(sequence_of_shl_or_lshr(sequence_of_bops( +/// sequence_of_or_and_pair(load P), ... ))) +/// An example: +/// and(lshr(add(or(and(load P, -65536), MaskedVal), 0x1000000000000), 48), +/// 255) +/// The actual modified range of load is [48, 56). The value in the range is +/// incremented by 1. The sequence above will be transformed to: +/// zext(add(load_i8(P + 48), 1), 64) +bool MemAccessShrinkingPass::reduceLoadOpsWidth(Instruction &IN) { + // If the instruction is actually dead, skip it. 
+  if (IN.use_empty())
+    return false;
+
+  Type *Ty = IN.getType();
+  if (Ty->isVectorTy() || !Ty->isIntegerTy())
+    return false;
+
+  MultiUsesSeen = false;
+  SavedInsts = 0;
+
+  // Match and(Val, Cst) or trunc(Val).
+  Value *Val;
+  ConstantInt *AndCst = nullptr;
+  if (!match(&IN, m_Trunc(m_Value(Val))) &&
+      !match(&IN, m_And(m_Value(Val), m_ConstantInt(AndCst))))
+    return false;
+  addSavedInst(IN, true);
+
+  // Get the initial MR.
+  ModRange MR;
+  MR.Shift = 0;
+  unsigned ResShift = 0;
+  unsigned TBits = DL->getTypeSizeInBits(Val->getType());
+  if (AndCst) {
+    APInt AI = AndCst->getValue();
+    // Check that AI has consecutive one bits; those bits are the range to be
+    // used. If the number of trailing zeros is not zero, remember it in
+    // ResShift; the value after shrinking needs to be shifted back
+    // accordingly.
+    MR.Shift = AI.countTrailingZeros();
+    ResShift = MR.Shift;
+    if (!(AI.lshr(MR.Shift) + 1).isPowerOf2() || MR.Shift == TBits)
+      return false;
+    MR.Width = AI.getActiveBits() - MR.Shift;
+  } else {
+    MR.Width = DL->getTypeSizeInBits(Ty);
+  }
+
+  // Match a series of LShr or Shl. Adjust MR.Shift accordingly.
+  // Note we have to be careful because valid bits may be cleared by
+  // back-and-forth shifts. We use an all-ones Mask APInt to simulate the
+  // shifts and track which bits remain valid after the shifts.
+  Value *Opd;
+  bool isLShr;
+  unsigned NewShift = MR.Shift;
+  APInt Mask(TBits, -1);
+  ConstantInt *Cst;
+  // Record the series of LShr or Shl in ShiftRecs.
+  SmallVector<std::pair<bool, uint64_t>, 4> ShiftRecs;
+  while ((isLShr = match(Val, m_LShr(m_Value(Opd), m_ConstantInt(Cst)))) ||
+         match(Val, m_Shl(m_Value(Opd), m_ConstantInt(Cst)))) {
+    addSavedInst(*cast<Instruction>(Val));
+    NewShift = isLShr ? (NewShift + Cst->getZExtValue())
+                      : (NewShift - Cst->getZExtValue());
+    ShiftRecs.push_back({isLShr, Cst->getZExtValue()});
+    Val = Opd;
+  }
+  // Iterate over ShiftRecs in reverse order and simulate the shifts on Mask.
+  for (auto SR : make_range(ShiftRecs.rbegin(), ShiftRecs.rend())) {
+    if (SR.first)
+      Mask = Mask.lshr(SR.second);
+    else
+      Mask = Mask.shl(SR.second);
+  }
+  // If all the bits in MR are one in Mask, MR is still valid after the shifts.
+  if (!CompareAPIntWithModRange(Mask, MR, false))
+    return false;
+  MR.Shift = NewShift;
+
+  // Restrict MR to shapes we can handle.
+  if (MR.Shift + MR.Width > TBits)
+    return false;
+  if (MR.Width != 8 && MR.Width != 16 && MR.Width != 32)
+    return false;
+  if (MR.Shift % 8 != 0)
+    return false;
+
+  // Match other binary operators like Add/Sub/And/Xor/Or, or a pattern like
+  // And(Or(...And(Or(Val, Cst)))), and find the final load.
+  SmallVector<Instruction *, 8> Insts;
+  if (!matchInstsToShrink(Val, MR, Insts))
+    return false;
+
+  // The final load is expected to have been found here.
+  assert(!Insts.empty() && "Expect Insts to be not empty");
+  LoadInst *LI = dyn_cast<LoadInst>(Insts.back());
+  assert(LI && "Last elem in Insts should be a LoadInst");
+  if (LI->isVolatile() || !LI->isUnordered())
+    return false;
+
+  IntegerType *NewTy = Type::getIntNTy(*Context, MR.Width);
+  if (TLI->isNarrowingExpensive(EVT::getEVT(Ty), EVT::getEVT(NewTy)))
+    return false;
+
+  // Do a legality check to ensure the range of the shrunken load is not
+  // clobbered between *LI and IN.
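The all-ones-mask bookkeeping above is easier to see on concrete numbers. The following standalone C++ sketch is not part of the patch; it fixes the width to 64 bits, uses plain integers instead of APInt, and invents its own helper names to replay a shift sequence and test whether a bit range survives.

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Replay a sequence of shifts on an all-ones 64-bit mask. Shifts are given
// outermost-first, i.e. in the order the pass matches them while walking from
// the and/trunc towards the load; {true, N} is lshr by N, {false, N} shl by N.
// A bit that is still 1 afterwards was never shifted out and refilled with
// zeros, so a range made only of such bits still comes from the load.
static uint64_t
survivingBits(const std::vector<std::pair<bool, unsigned>> &Shifts) {
  uint64_t Mask = ~0ULL;
  // Replay innermost-first, matching how the pass iterates ShiftRecs in
  // reverse.
  for (auto It = Shifts.rbegin(); It != Shifts.rend(); ++It)
    Mask = It->first ? (Mask >> It->second) : (Mask << It->second);
  return Mask;
}

static bool rangeSurvives(uint64_t Mask, unsigned Shift, unsigned Width) {
  uint64_t Want = (Width == 64) ? ~0ULL : (((1ULL << Width) - 1) << Shift);
  return (Mask & Want) == Want;
}

int main() {
  // and(lshr(load, 48), 255): the used range is [0, 8) of the shifted value,
  // and those bits survive a single lshr by 48.
  assert(rangeSurvives(survivingBits({{true, 48}}), 0, 8));

  // lshr(shl(load, 8), 8): the lshr refills the top byte with zeros, so a
  // range that needs bits [56, 64) of the result must be rejected.
  assert(!rangeSurvives(survivingBits({{true, 8}, {false, 8}}), 56, 8));
  return 0;
}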
+  IRBuilder<> Builder(*Context);
+  Builder.SetInsertPoint(&IN);
+  unsigned StOffset = computeStOffset(MR, TBits, *DL);
+  Value *NewPtr =
+      createNewPtr(LI->getPointerOperand(), StOffset, NewTy, Builder);
+  Instruction *NewInsertPt = nullptr;
+  if (hasClobberBetween(*LI, IN, MemoryLocation(NewPtr, MR.Width / 8),
+                        NewInsertPt)) {
+    RecursivelyDeleteTriviallyDeadInstructions(NewPtr);
+    return false;
+  }
+  if (!isBeneficial(ResShift, Insts)) {
+    RecursivelyDeleteTriviallyDeadInstructions(NewPtr);
+    return false;
+  }
+
+  // Start shrinking the instructions collected in Insts.
+  Value *NewVal =
+      shrinkInsts(NewPtr, MR, Insts, NewInsertPt, ResShift, Builder);
+  DEBUG(dbgs() << "MemShrink: Replace" << IN << " with" << *NewVal << "\n");
+  IN.replaceAllUsesWith(NewVal);
+  markCandForDeletion(IN);
+  NumLoadShrinked++;
+  return true;
+}
+
+/// Check the instructions in CandidatesToErase. If they are dead, remove them
+/// recursively and clean up MemorySSA.
+bool MemAccessShrinkingPass::removeDeadInsts() {
+  SmallVector<Instruction *, 16> DeadInsts;
+  bool Changed = false;
+
+  for (Instruction *I : CandidatesToErase) {
+    if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLInfo))
+      continue;
+    assert(DeadInsts.empty() && "DeadInsts is empty at the beginning");
+    DeadInsts.push_back(I);
+
+    do {
+      I = DeadInsts.pop_back_val();
+
+      // Null out all of the instruction's operands to see if any operand
+      // becomes dead as we go.
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+        Value *OpV = I->getOperand(i);
+        I->setOperand(i, nullptr);
+
+        if (!OpV->use_empty())
+          continue;
+
+        // If the operand is an instruction that became dead as we nulled out
+        // the operand, and if it is 'trivially' dead, delete it in a future
+        // loop iteration.
+        if (Instruction *OpI = dyn_cast<Instruction>(OpV)) {
+          if (isInstructionTriviallyDead(OpI, TLInfo))
+            DeadInsts.push_back(OpI);
+          else
+            CandidatesToErase.insert(OpI);
+        }
+      }
+
+      MemoryAccess *MemAcc = MSSA->getMemoryAccess(I);
+      if (MemAcc)
+        MSSAUpdater->removeMemoryAccess(MemAcc);
+
+      CandidatesToErase.erase(I);
+      I->eraseFromParent();
+
+      Changed = true;
+    } while (!DeadInsts.empty());
+  }
+  return Changed;
+}
+
+/// Perform the memory access shrinking optimization for the given function.
+bool MemAccessShrinkingPass::runOnFunction(Function &Fn) {
+  if (skipFunction(Fn))
+    return false;
+
+  DL = &Fn.getParent()->getDataLayout();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+  MSSAWalker = MSSA->getWalker();
+  MSSAUpdater = make_unique<MemorySSAUpdater>(MSSA);
+  CandidatesToErase.clear();
+
+  Context = &Fn.getContext();
+
+  if (TM)
+    TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
+
+  // Iterate over the function until nothing changes.
+ MSSA->verifyMemorySSA(); + bool MadeChange = true; + bool EverMadeChange = false; + while (MadeChange) { + MadeChange = false; + for (BasicBlock *BB : post_order(&Fn)) { + auto CurInstIterator = BB->rbegin(); + while (CurInstIterator != BB->rend()) + MadeChange |= tryShrinkOnInst(*CurInstIterator++); + } + MadeChange |= removeDeadInsts(); + EverMadeChange |= MadeChange; + } + + return EverMadeChange; +} Index: lib/CodeGen/TargetPassConfig.cpp =================================================================== --- lib/CodeGen/TargetPassConfig.cpp +++ lib/CodeGen/TargetPassConfig.cpp @@ -68,6 +68,8 @@ cl::desc("Disable Machine LICM")); static cl::opt DisableMachineSink("disable-machine-sink", cl::Hidden, cl::desc("Disable Machine Sinking")); +static cl::opt DisableMemAccessShrinking("disable-memaccess-shrinking", + cl::Hidden, cl::desc("Disable MemAccessShrinking")); static cl::opt DisableLSR("disable-lsr", cl::Hidden, cl::desc("Disable Loop Strength Reduction Pass")); static cl::opt DisableConstantHoisting("disable-constant-hoisting", @@ -475,6 +477,9 @@ if (getOptLevel() != CodeGenOpt::None && !DisableConstantHoisting) addPass(createConstantHoistingPass()); + if (getOptLevel() != CodeGenOpt::None && !DisableMemAccessShrinking) + addPass(createMemAccessShrinkingPass(TM)); + if (getOptLevel() != CodeGenOpt::None && !DisablePartialLibcallInlining) addPass(createPartiallyInlineLibCallsPass()); Index: lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.h +++ lib/Target/AArch64/AArch64ISelLowering.h @@ -284,6 +284,8 @@ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool isNarrowingExpensive(EVT VT1, EVT VT2) const override; + /// Return true if the given shuffle mask can be codegen'd directly, or if it /// should be stack expanded. bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6718,6 +6718,10 @@ return SDValue(); } +bool AArch64TargetLowering::isNarrowingExpensive(EVT VT1, EVT VT2) const { + return false; +} + bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const { if (VT.getVectorNumElements() == 4 && Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -428,6 +428,7 @@ Sched::Preference getSchedulingPreference(SDNode *N) const override; + bool isNarrowingExpensive(EVT VT1, EVT VT2) const override; bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -6432,6 +6432,10 @@ return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } +bool ARMTargetLowering::isNarrowingExpensive(EVT VT1, EVT VT2) const { + return false; +} + /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -939,6 +939,9 @@ /// from i32 to i8 but not from i32 to i16. bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + /// Return true if it's expensive to narrow operations of type VT1 to VT2. + bool isNarrowingExpensive(EVT VT1, EVT VT2) const override; + /// Given an intrinsic, checks if on the target the intrinsic will need to map /// to a MemIntrinsicNode (touches memory). If this is the case, it returns /// true and stores the intrinsic information into the IntrinsicInfo that was Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -24338,6 +24338,10 @@ return !(VT1 == MVT::i32 && VT2 == MVT::i16); } +bool X86TargetLowering::isNarrowingExpensive(EVT VT1, EVT VT2) const { + return false; +} + /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values Index: test/CodeGen/AArch64/arm64-fast-isel-conversion.ll =================================================================== --- test/CodeGen/AArch64/arm64-fast-isel-conversion.ll +++ test/CodeGen/AArch64/arm64-fast-isel-conversion.ll @@ -9,12 +9,11 @@ ; CHECK: strh w1, [sp, #12] ; CHECK: str w2, [sp, #8] ; CHECK: str x3, [sp] -; CHECK: ldr x3, [sp] -; CHECK: mov x0, x3 +; CHECK: ldr w0, [sp] ; CHECK: str w0, [sp, #8] -; CHECK: ldr w0, [sp, #8] +; CHECK: ldrh w0, [sp, #8] ; CHECK: strh w0, [sp, #12] -; CHECK: ldrh w0, [sp, #12] +; CHECK: ldrb w0, [sp, #12] ; CHECK: strb w0, [sp, #15] ; CHECK: ldrb w0, [sp, #15] ; CHECK: add sp, sp, #16 @@ -402,10 +401,8 @@ define void @stack_trunc() nounwind { ; CHECK-LABEL: stack_trunc ; CHECK: sub sp, sp, #16 -; CHECK: ldr [[REG:x[0-9]+]], [sp] -; CHECK: mov x[[REG2:[0-9]+]], [[REG]] -; CHECK: and [[REG3:w[0-9]+]], w[[REG2]], #0xff -; CHECK: strb [[REG3]], [sp, #15] +; CHECK: ldrb [[REG:w[0-9]+]], [sp] +; CHECK: strb [[REG]], [sp, #15] ; CHECK: add sp, sp, #16 %a = alloca i8, align 1 %b = alloca i64, align 8 Index: test/CodeGen/AArch64/arm64-fast-isel-gv.ll =================================================================== --- test/CodeGen/AArch64/arm64-fast-isel-gv.ll +++ test/CodeGen/AArch64/arm64-fast-isel-gv.ll @@ -25,7 +25,7 @@ ; CHECK: add [[REG7:x[0-9]+]], [[REG6]], [[REG3]] ; CHECK: and [[REG8:x[0-9]+]], [[REG7]], #0xffff ; CHECK: str [[REG8]], {{\[}}[[REG1]]{{\]}} -; CHECK: ldr {{x[0-9]+}}, {{\[}}[[REG1]]{{\]}} +; CHECK: ldr {{w[0-9]+}}, {{\[}}[[REG1]]{{\]}} %0 = load i64, i64* @seed, align 8 %mul = mul nsw i64 %0, 1309 %add = add nsw i64 %mul, 13849 Index: test/CodeGen/ARM/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -12,13 +12,9 @@ ; ; BE-LABEL: i24_or: ; BE: @ BB#0: -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: ldrb r2, [r0, #2] -; BE-NEXT: orr r1, r2, r1, lsl #8 +; BE-NEXT: ldrh r1, [r0, #1] ; BE-NEXT: orr r1, r1, #384 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] +; BE-NEXT: strh r1, [r0, #1] ; BE-NEXT: mov pc, lr %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 
384 @@ -29,36 +25,23 @@ define void @i24_and_or(i24* %a) { ; LE-LABEL: i24_and_or: ; LE: @ BB#0: -; LE-NEXT: ldrb r1, [r0, #2] -; LE-NEXT: ldrh r2, [r0] -; LE-NEXT: orr r1, r2, r1, lsl #16 -; LE-NEXT: ldr r2, .LCPI1_0 +; LE-NEXT: ldrh r1, [r0] +; LE-NEXT: mov r2, #16256 +; LE-NEXT: orr r2, r2, #49152 ; LE-NEXT: orr r1, r1, #384 ; LE-NEXT: and r1, r1, r2 ; LE-NEXT: strh r1, [r0] -; LE-NEXT: lsr r1, r1, #16 -; LE-NEXT: strb r1, [r0, #2] ; LE-NEXT: mov pc, lr -; LE-NEXT: .p2align 2 -; LE-NEXT: @ BB#1: -; LE-NEXT: .LCPI1_0: -; LE-NEXT: .long 16777088 @ 0xffff80 ; ; BE-LABEL: i24_and_or: ; BE: @ BB#0: -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: mov r2, #384 -; BE-NEXT: orr r1, r2, r1, lsl #8 -; BE-NEXT: ldr r2, .LCPI1_0 +; BE-NEXT: ldrh r1, [r0, #1] +; BE-NEXT: mov r2, #16256 +; BE-NEXT: orr r2, r2, #49152 +; BE-NEXT: orr r1, r1, #384 ; BE-NEXT: and r1, r1, r2 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] -; BE-NEXT: mov pc, lr -; BE-NEXT: .p2align 2 -; BE-NEXT: @ BB#1: -; BE-NEXT: .LCPI1_0: -; BE-NEXT: .long 16777088 @ 0xffff80 +; BE-NEXT: strh r1, [r0, #1] +; BE-NEXT: mov pc, lr %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 %d = or i24 %c, 384 @@ -69,37 +52,19 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; LE-LABEL: i24_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldrb r2, [r0, #2] -; LE-NEXT: ldrh r3, [r0] -; LE-NEXT: orr r2, r3, r2, lsl #16 -; LE-NEXT: ldr r3, .LCPI2_0 -; LE-NEXT: and r2, r2, r3 -; LE-NEXT: lsr r3, r2, #16 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: strb r3, [r0, #2] -; LE-NEXT: strh r1, [r0] +; LE-NEXT: ldrb r2, [r0, #1] +; LE-NEXT: and r2, r2, #223 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: strb r1, [r0, #1] ; LE-NEXT: mov pc, lr -; LE-NEXT: .p2align 2 -; LE-NEXT: @ BB#1: -; LE-NEXT: .LCPI2_0: -; LE-NEXT: .long 16769023 @ 0xffdfff ; ; BE-LABEL: i24_insert_bit: ; BE: @ BB#0: -; BE-NEXT: ldrh r2, [r0] -; BE-NEXT: ldrb r3, [r0, #2] -; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: ldr r3, .LCPI2_0 -; BE-NEXT: and r2, r2, r3 -; BE-NEXT: orr r1, r2, r1, lsl #13 -; BE-NEXT: strb r2, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] -; BE-NEXT: mov pc, lr -; BE-NEXT: .p2align 2 -; BE-NEXT: @ BB#1: -; BE-NEXT: .LCPI2_0: -; BE-NEXT: .long 16769023 @ 0xffdfff +; BE-NEXT: ldrb r2, [r0, #1] +; BE-NEXT: and r2, r2, #223 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: strb r1, [r0, #1] +; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i24 %b = load i24, i24* %a, align 1 %extbit.shl = shl nuw nsw i24 %extbit, 13 @@ -119,19 +84,9 @@ ; ; BE-LABEL: i56_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: ldrh r2, [r1, #4]! -; BE-NEXT: ldrb r3, [r1, #2] -; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strb r2, [r1, #2] -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %aa = load i56, i56* %a %b = or i56 %aa, 384 @@ -150,19 +105,10 @@ ; ; BE-LABEL: i56_and_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: mov r3, #128 -; BE-NEXT: ldrh r2, [r1, #4]! 
-; BE-NEXT: strb r3, [r1, #2] -; BE-NEXT: lsl r2, r2, #8 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: bic r1, r1, #127 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %b = load i56, i56* %a, align 1 @@ -175,31 +121,18 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { ; LE-LABEL: i56_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldr r2, [r0] -; LE-NEXT: bic r2, r2, #8192 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: str r1, [r0] +; LE-NEXT: ldr r2, [r0, #1] +; LE-NEXT: bic r2, r2, #32 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: str r1, [r0, #1] ; LE-NEXT: mov pc, lr ; ; BE-LABEL: i56_insert_bit: ; BE: @ BB#0: -; BE-NEXT: .save {r11, lr} -; BE-NEXT: push {r11, lr} -; BE-NEXT: mov r2, r0 -; BE-NEXT: ldr lr, [r0] -; BE-NEXT: ldrh r12, [r2, #4]! -; BE-NEXT: ldrb r3, [r2, #2] -; BE-NEXT: orr r12, r3, r12, lsl #8 -; BE-NEXT: orr r3, r12, lr, lsl #24 -; BE-NEXT: bic r3, r3, #8192 -; BE-NEXT: orr r1, r3, r1, lsl #13 -; BE-NEXT: strb r3, [r2, #2] -; BE-NEXT: lsr r3, r1, #8 -; BE-NEXT: strh r3, [r2] -; BE-NEXT: bic r2, lr, #255 -; BE-NEXT: orr r1, r2, r1, lsr #24 -; BE-NEXT: str r1, [r0] -; BE-NEXT: pop {r11, lr} +; BE-NEXT: ldr r2, [r0, #2] +; BE-NEXT: bic r2, r2, #32 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: str r1, [r0, #2] ; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 Index: test/CodeGen/ARM/load-shrink.ll =================================================================== --- test/CodeGen/ARM/load-shrink.ll +++ test/CodeGen/ARM/load-shrink.ll @@ -0,0 +1,109 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s +; Check bitfield load is shrinked properly in cases below. + +; The bitfield store can be shrinked from i64 to i16. +; CHECK-LABEL: @load_and( +; CHECK: %cast = bitcast i64* %ptr to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 8 +; CHECK: %trunc.zext = zext i16 %load.trunc to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext, 1 + +define i1 @load_and(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.clear = and i64 %bf.load, 65535 + %cmp = icmp eq i64 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. +; CHECK-LABEL: @load_trunc( +; CHECK: %cast = bitcast i64* %ptr to i8* +; CHECK: %load.trunc = load i8, i8* %cast, align 8 +; CHECK: %cmp = icmp eq i8 %load.trunc, 1 + +define i1 @load_trunc(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.clear = trunc i64 %bf.load to i8 + %cmp = icmp eq i8 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. +; CHECK-LABEL: @load_and_shr( +; CHECK: %cast = bitcast i64* %ptr to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 2 +; CHECK: %trunc.zext = zext i8 %load.trunc to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext, 1 + +define i1 @load_and_shr(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.lshr = lshr i64 %bf.load, 48 + %bf.clear = and i64 %bf.lshr, 255 + %cmp = icmp eq i64 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. 
+; CHECK-LABEL: @load_and_shr_add( +; CHECK: %cast = bitcast i64* %ptr to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 2 +; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1 +; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext, 1 + +define i1 @load_and_shr_add(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %add = add i64 %bf.load, 281474976710656 + %bf.lshr = lshr i64 %add, 48 + %bf.clear = and i64 %bf.lshr, 255 + %cmp = icmp eq i64 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. +; CHECK-LABEL: @load_ops( +; CHECK: %cast = bitcast i64* %ptr to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 2 +; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1 +; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext, 1 + +define i1 @load_ops(i64* %ptr, i64 %value) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.value = and i64 %value, 65535 + %bf.clear1 = and i64 %bf.load, -65536 + %bf.set = or i64 %bf.value, %bf.clear1 + %add = add i64 %bf.set, 281474976710656 + %bf.lshr = lshr i64 %add, 48 + %bf.clear = and i64 %bf.lshr, 255 + %cmp = icmp eq i64 %bf.clear, 1 + ret i1 %cmp +} + +; It doesn't worth to do the shrink because %bf.load has multiple uses +; and the shrink here doesn't save instructions. +; CHECK-LABEL: @load_trunc_multiuses +; CHECK: %bf.load = load i64, i64* %ptr, align 8 +; CHECK: %bf.trunc = trunc i64 %bf.load to i16 +; CHECK: %cmp1 = icmp eq i16 %bf.trunc, 3 +; CHECK: %cmp2 = icmp ult i64 %bf.load, 1500000 +; CHECK: %cmp = and i1 %cmp1, %cmp2 + +define i1 @load_trunc_multiuses(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.trunc = trunc i64 %bf.load to i16 + %cmp1 = icmp eq i16 %bf.trunc, 3 + %cmp2 = icmp ult i64 %bf.load, 1500000 + %cmp = and i1 %cmp1, %cmp2 + ret i1 %cmp +} + Index: test/CodeGen/ARM/store-shrink.ll =================================================================== --- test/CodeGen/ARM/store-shrink.ll +++ test/CodeGen/ARM/store-shrink.ll @@ -0,0 +1,364 @@ +; RUN: opt < %s -mtriple=arm-eabi -memaccessshrink -S | FileCheck %s +; Check bitfield store is shrinked properly in cases below. + +; class A1 { +; unsigned long f1:8; +; unsigned long f2:3; +; } a1; +; a1.f1 = n; +; +; The bitfield store can be shrinked from i16 to i8. +; CHECK-LABEL: @test1( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %t0 = trunc i64 %conv to i16 +; CHECK: %bf.value = and i16 %t0, 255 +; CHECK: %trunc = trunc i16 %bf.value to i8 +; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8 + +%class.A1 = type { i16, [6 x i8] } +@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8 + +define void @test1(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %t0 = trunc i64 %conv to i16 + %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + %bf.value = and i16 %t0, 255 + %bf.clear = and i16 %bf.load, -256 + %bf.set = or i16 %bf.clear, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + ret void +} + +; class A2 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a2; +; a2.f1 = n; +; The bitfield store can be shrinked from i32 to i16. 
+; CHECK-LABEL: @test2( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8 + +%class.A2 = type { i24, [4 x i8] } +@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8 + +define void @test2(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.clear = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear, %bf.value + store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8 + ret void +} + +; class A3 { +; unsigned long f1:32; +; unsigned long f2:3; +; } a3; +; a3.f1 = n; +; The bitfield store can be shrinked from i64 to i32. +; CHECK-LABEL: @test3( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %bf.value = and i64 %conv, 4294967295 +; CHECK: %trunc = trunc i64 %bf.value to i32 +; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8 + +%class.A3 = type { i40 } +@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8 + +define void @test3(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8 + %bf.value = and i64 %conv, 4294967295 + %bf.clear = and i64 %bf.load, -4294967296 + %bf.set = or i64 %bf.clear, %bf.value + store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8 + ret void +} + +; class A4 { +; unsigned long f1:13; +; unsigned long f2:3; +; } a4; +; a4.f1 = n; +; The bitfield store cannot be shrinked because the field is not 8/16/32 bits. +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. 
+; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. +; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. 
+; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. +; CHECK-LABEL: @i24_insert_bit( +; CHECK: %extbit = zext i1 %bit to i24 +; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13 +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i24 %extbit.shl, 8 +; CHECK: %trunc = trunc i24 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i24 + %b = load i24, i24* %a, align 1 + %extbit.shl = shl nuw nsw i24 %extbit, 13 + %c = and i24 %b, -8193 + %d = or i24 %c, %extbit.shl + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or( +; CHECK: %cast = bitcast i56* %a to i32* +; CHECK: %load.trunc = load i32, i32* %cast, align 1 +; CHECK: %or.trunc = or i32 %load.trunc, 384 +; CHECK: store i32 %or.trunc, i32* %cast, align 1 +; +define void @i56_or(i56* %a) { + %aa = load i56, i56* %a, align 1 + %b = or i56 %aa, 384 + store i56 %b, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_and_or( +; CHECK: %cast = bitcast i56* %a to i32* +; CHECK: %load.trunc = load i32, i32* %cast, align 1 +; CHECK: %and.trunc = and i32 %load.trunc, -128 +; CHECK: %or.trunc = or i32 %and.trunc, 384 +; CHECK: store i32 %or.trunc, i32* %cast, align 1 +; +define void @i56_and_or(i56* %a) { + %b = load i56, i56* %a, align 1 + %c = and i56 %b, -128 + %d = or i56 %c, 384 + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store to i8. 
+; CHECK-LABEL: @i56_insert_bit( +; CHECK: %extbit = zext i1 %bit to i56 +; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13 +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %load.trunc = load i32, i32* %cast1, align 1 +; CHECK: %lshr = lshr i56 %extbit.shl, 8 +; CHECK: %trunc = trunc i56 %lshr to i32 +; CHECK: %and.trunc = and i32 %load.trunc, -33 +; CHECK: %or.trunc = or i32 %and.trunc, %trunc +; CHECK: store i32 %or.trunc, i32* %cast1, align 1 +; +define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i56 + %b = load i56, i56* %a, align 1 + %extbit.shl = shl nuw nsw i56 %extbit, 13 + %c = and i56 %b, -8193 + %d = or i56 %c, %extbit.shl + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %load.trunc = load i32, i32* %cast1, align 2 +; CHECK: %or.trunc = or i32 %load.trunc, 272 +; CHECK: store i32 %or.trunc, i32* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: test/CodeGen/X86/2008-09-11-CoalescerBug2.ll =================================================================== --- test/CodeGen/X86/2008-09-11-CoalescerBug2.ll +++ test/CodeGen/X86/2008-09-11-CoalescerBug2.ll @@ -13,9 +13,9 @@ ; SOURCE-SCHED: xorl ; SOURCE-SCHED: cmpl ; SOURCE-SCHED: setg -; SOURCE-SCHED: movb ; SOURCE-SCHED: xorl ; SOURCE-SCHED: subl +; SOURCE-SCHED: movb ; SOURCE-SCHED: testb ; SOURCE-SCHED: jne %0 = load i32, i32* @g_5, align 4 ; [#uses=1] Index: test/CodeGen/X86/constant-combines.ll =================================================================== --- test/CodeGen/X86/constant-combines.ll +++ test/CodeGen/X86/constant-combines.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | FileCheck %s +; RUN: llc -disable-memaccess-shrinking < %s | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" Index: test/CodeGen/X86/i16lshr8pat.ll =================================================================== --- test/CodeGen/X86/i16lshr8pat.ll +++ test/CodeGen/X86/i16lshr8pat.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=x86 -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s +; RUN: llc -march=x86 -disable-memaccess-shrinking -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386-unknown-linux-gnu" Index: test/CodeGen/X86/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -4,13 +4,7 @@ define void @i24_or(i24* %a) { ; CHECK-LABEL: i24_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180 ; CHECK-NEXT: retq %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 @@ -22,14 +16,9 @@ ; CHECK-LABEL: i24_and_or: ; CHECK: # BB#0: ; CHECK-NEXT: movzwl (%rdi), %eax -; 
CHECK-NEXT: movzbl 2(%rdi), %ecx
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: orl $384, %ecx # imm = 0x180
-; CHECK-NEXT: andl $16777088, %ecx # imm = 0xFFFF80
-; CHECK-NEXT: movw %cx, (%rdi)
-; CHECK-NEXT: shrl $16, %ecx
-; CHECK-NEXT: movb %cl, 2(%rdi)
+; CHECK-NEXT: orl $384, %eax # imm = 0x180
+; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80
+; CHECK-NEXT: movw %ax, (%rdi)
; CHECK-NEXT: retq
%b = load i24, i24* %a, align 1
%c = and i24 %b, -128
@@ -41,17 +30,11 @@
define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
; CHECK-LABEL: i24_insert_bit:
; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: movzwl (%rdi), %ecx
-; CHECK-NEXT: movzbl 2(%rdi), %edx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: shll $13, %eax
-; CHECK-NEXT: andl $16769023, %edx # imm = 0xFFDFFF
-; CHECK-NEXT: orl %edx, %eax
-; CHECK-NEXT: shrl $16, %edx
-; CHECK-NEXT: movb %dl, 2(%rdi)
-; CHECK-NEXT: movw %ax, (%rdi)
+; CHECK-NEXT: movb 1(%rdi), %al
+; CHECK-NEXT: shlb $5, %sil
+; CHECK-NEXT: andb $-33, %al
+; CHECK-NEXT: orb %sil, %al
+; CHECK-NEXT: movb %al, 1(%rdi)
; CHECK-NEXT: retq
%extbit = zext i1 %bit to i24
%b = load i24, i24* %a, align 1
@@ -65,19 +48,7 @@
define void @i56_or(i56* %a) {
; CHECK-LABEL: i56_or:
; CHECK: # BB#0:
-; CHECK-NEXT: movzwl 4(%rdi), %eax
-; CHECK-NEXT: movzbl 6(%rdi), %ecx
-; CHECK-NEXT: movl (%rdi), %edx
-; CHECK-NEXT: movb %cl, 6(%rdi)
-; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: shlq $32, %rcx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: orq $384, %rdx # imm = 0x180
-; CHECK-NEXT: movl %edx, (%rdi)
-; CHECK-NEXT: shrq $32, %rdx
-; CHECK-NEXT: movw %dx, 4(%rdi)
+; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180
; CHECK-NEXT: retq
%aa = load i56, i56* %a, align 1
%b = or i56 %aa, 384
@@ -88,22 +59,10 @@
define void @i56_and_or(i56* %a) {
; CHECK-LABEL: i56_and_or:
; CHECK: # BB#0:
-; CHECK-NEXT: movzwl 4(%rdi), %eax
-; CHECK-NEXT: movzbl 6(%rdi), %ecx
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: shlq $32, %rcx
-; CHECK-NEXT: movl (%rdi), %eax
-; CHECK-NEXT: orq %rcx, %rax
-; CHECK-NEXT: orq $384, %rax # imm = 0x180
-; CHECK-NEXT: movabsq $72057594037927808, %rcx # imm = 0xFFFFFFFFFFFF80
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movl %ecx, (%rdi)
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movw %ax, 4(%rdi)
-; CHECK-NEXT: shrq $48, %rcx
-; CHECK-NEXT: movb %cl, 6(%rdi)
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: orl $384, %eax # imm = 0x180
+; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80
+; CHECK-NEXT: movw %ax, (%rdi)
; CHECK-NEXT: retq
%b = load i56, i56* %a, align 1
%c = and i56 %b, -128
@@ -115,23 +74,11 @@
define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
; CHECK-LABEL: i56_insert_bit:
; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: movzwl 4(%rdi), %ecx
-; CHECK-NEXT: movzbl 6(%rdi), %edx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: shlq $32, %rdx
-; CHECK-NEXT: movl (%rdi), %ecx
-; CHECK-NEXT: orq %rdx, %rcx
-; CHECK-NEXT: shlq $13, %rax
-; CHECK-NEXT: movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF
-; CHECK-NEXT: andq %rcx, %rdx
-; CHECK-NEXT: orq %rdx, %rax
-; CHECK-NEXT: movl %eax, (%rdi)
-; CHECK-NEXT: shrq $48, %rdx
-; CHECK-NEXT: movb %dl, 6(%rdi)
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movw %ax, 4(%rdi)
+; CHECK-NEXT: movb 1(%rdi), %al
+; CHECK-NEXT: shlb $5, %sil
+; CHECK-NEXT: andb $-33, %al
+; CHECK-NEXT: orb %sil, %al
+; CHECK-NEXT: movb %al, 1(%rdi)
; CHECK-NEXT: retq
%extbit = zext i1 %bit to i56
%b = load i56, i56* %a, align 1
Index: test/CodeGen/X86/load-shrink.ll
===================================================================
--- test/CodeGen/X86/load-shrink.ll
+++ test/CodeGen/X86/load-shrink.ll
@@ -0,0 +1,109 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s
+; Check that bitfield loads are shrunk properly in the cases below.
+
+; The bitfield load can be shrunk from i64 to i16.
+; CHECK-LABEL: @load_and(
+; CHECK: %cast = bitcast i64* %ptr to i16*
+; CHECK: %load.trunc = load i16, i16* %cast, align 8
+; CHECK: %trunc.zext = zext i16 %load.trunc to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and(i64* %ptr) {
+entry:
+ %bf.load = load i64, i64* %ptr, align 8
+ %bf.clear = and i64 %bf.load, 65535
+ %cmp = icmp eq i64 %bf.clear, 1
+ ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_trunc(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %load.trunc = load i8, i8* %cast, align 8
+; CHECK: %cmp = icmp eq i8 %load.trunc, 1
+
+define i1 @load_trunc(i64* %ptr) {
+entry:
+ %bf.load = load i64, i64* %ptr, align 8
+ %bf.clear = trunc i64 %bf.load to i8
+ %cmp = icmp eq i8 %bf.clear, 1
+ ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_and_shr(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %trunc.zext = zext i8 %load.trunc to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr(i64* %ptr) {
+entry:
+ %bf.load = load i64, i64* %ptr, align 8
+ %bf.lshr = lshr i64 %bf.load, 48
+ %bf.clear = and i64 %bf.lshr, 255
+ %cmp = icmp eq i64 %bf.clear, 1
+ ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_and_shr_add(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr_add(i64* %ptr) {
+entry:
+ %bf.load = load i64, i64* %ptr, align 8
+ %add = add i64 %bf.load, 281474976710656
+ %bf.lshr = lshr i64 %add, 48
+ %bf.clear = and i64 %bf.lshr, 255
+ %cmp = icmp eq i64 %bf.clear, 1
+ ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_ops(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_ops(i64* %ptr, i64 %value) {
+entry:
+ %bf.load = load i64, i64* %ptr, align 8
+ %bf.value = and i64 %value, 65535
+ %bf.clear1 = and i64 %bf.load, -65536
+ %bf.set = or i64 %bf.value, %bf.clear1
+ %add = add i64 %bf.set, 281474976710656
+ %bf.lshr = lshr i64 %add, 48
+ %bf.clear = and i64 %bf.lshr, 255
+ %cmp = icmp eq i64 %bf.clear, 1
+ ret i1 %cmp
+}
+
+; The shrink is not worthwhile here because %bf.load has multiple uses
+; and the shrink doesn't save any instructions.
+; CHECK-LABEL: @load_trunc_multiuses +; CHECK: %bf.load = load i64, i64* %ptr, align 8 +; CHECK: %bf.trunc = trunc i64 %bf.load to i16 +; CHECK: %cmp1 = icmp eq i16 %bf.trunc, 3 +; CHECK: %cmp2 = icmp ult i64 %bf.load, 1500000 +; CHECK: %cmp = and i1 %cmp1, %cmp2 + +define i1 @load_trunc_multiuses(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.trunc = trunc i64 %bf.load to i16 + %cmp1 = icmp eq i16 %bf.trunc, 3 + %cmp2 = icmp ult i64 %bf.load, 1500000 + %cmp = and i1 %cmp1, %cmp2 + ret i1 %cmp +} + Index: test/CodeGen/X86/load-slice.ll =================================================================== --- test/CodeGen/X86/load-slice.ll +++ test/CodeGen/X86/load-slice.ll @@ -110,20 +110,23 @@ ret i32 %res } -; Check that we do not optimize overlapping slices. ; -; The 64-bits should NOT have been split in as slices are overlapping. +; The slices are overlapping, but it can still be split. ; First slice uses bytes numbered 0 to 3. ; Second slice uses bytes numbered 6 and 7. ; Third slice uses bytes numbered 4 to 7. ; ; STRESS-LABEL: t3: -; STRESS: shrq $48 -; STRESS: shrq $32 +; STRESS: movzwl 6(%rdi,%rsi,8), %eax +; STRESS-NEXT: addl (%rdi,%rsi,8), %eax +; STRESS-NEXT: addl 4(%rdi,%rsi,8), %eax ; ; REGULAR-LABEL: t3: -; REGULAR: shrq $48 -; REGULAR: shrq $32 +; REGULAR: movq (%rdi,%rsi,8), %rcx +; REGULAR-NEXT: movq %rcx, %rax +; REGULAR-NEXT: shrq $48, %rax +; REGULAR-NEXT: addl %ecx, %eax +; REGULAR-NEXT: addl 4(%rdi,%rsi,8), %eax define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) { %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start %bitcast = bitcast %class.Complex* %arrayidx to i64* Index: test/CodeGen/X86/lsr-loop-exit-cond.ll =================================================================== --- test/CodeGen/X86/lsr-loop-exit-cond.ll +++ test/CodeGen/X86/lsr-loop-exit-cond.ll @@ -2,14 +2,16 @@ ; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s ; CHECK-LABEL: t: +; CHECK: [[LABEL:LBB.*]]: ; CHECK: movl (%r9,%rax,4), %e{{..}} -; CHECK-NEXT: testq -; CHECK-NEXT: jne +; CHECK: testq +; CHECK: jne [[LABEL]] ; ATOM-LABEL: t: +; ATOM: [[LABEL:LBB.*]]: ; ATOM: movl (%r9,%r{{.+}},4), %e{{..}} -; ATOM-NEXT: testq -; ATOM-NEXT: jne +; ATOM: testq +; ATOM: jne [[LABEL]] @Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5] @Te1 = external global [256 x i32] ; <[256 x i32]*> [#uses=4] Index: test/CodeGen/X86/mem-access-shrink.ll =================================================================== --- test/CodeGen/X86/mem-access-shrink.ll +++ test/CodeGen/X86/mem-access-shrink.ll @@ -0,0 +1,216 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s +; Check the memory accesses in the test are shrinked as expected. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%class.B = type { i56, [4 x i16], i64 } +@i = local_unnamed_addr global i64 0, align 8 + +define zeroext i1 @foo(%class.B* nocapture %this, i8* %p1, i64 %p2) local_unnamed_addr align 2 { +; CHECK-LABEL: @foo( +entry: +; CHECK: entry: +; CHECK: %t0 = bitcast %class.B* %this to i64* +; CHECK: %cast36 = bitcast i64* %t0 to i16* +; CHECK: %load.trunc37 = load i16, i16* %cast36, align 8 +; CHECK: %trunc.zext38 = zext i16 %load.trunc37 to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext38, 1 + + %t0 = bitcast %class.B* %this to i64* + %bf.load = load i64, i64* %t0, align 8 + %bf.clear = and i64 %bf.load, 65535 + %cmp = icmp eq i64 %bf.clear, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry +; CHECK: if.end: +; CHECK: %cast35 = bitcast i64* %t0 to i16* +; CHECK: %load.trunc = load i16, i16* %cast35, align 8 +; CHECK: %[[ADD1:.*]] = add i16 %load.trunc, -1 +; CHECK: %trunc.zext = zext i16 %[[ADD1]] to i64 +; CHECK: %cast33 = bitcast i64* %t0 to i16* +; CHECK: %trunc34 = trunc i64 %trunc.zext to i16 +; CHECK: store i16 %trunc34, i16* %cast33, align 8 + + %dec = add i64 %bf.load, 65535 + %bf.value = and i64 %dec, 65535 + %bf.clear5 = and i64 %bf.load, -65536 + %bf.set = or i64 %bf.value, %bf.clear5 + store i64 %bf.set, i64* %t0, align 8 + %t1 = ptrtoint i8* %p1 to i64 + %t2 = load i64, i64* @i, align 8 + %cmp.i = icmp ult i64 %t2, 3 + br i1 %cmp.i, label %if.then.i, label %if.else.i + +if.then.i: ; preds = %if.end + %and.i = lshr i64 %t1, 3 + %div.i = and i64 %and.i, 1023 + br label %_ZNK1B5m_fn3EPv.exit + +if.else.i: ; preds = %if.end + %first_page_.i = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 2 + %t3 = load i64, i64* %first_page_.i, align 8 + %mul.i = shl i64 %t3, 13 + %sub.i = sub i64 %t1, %mul.i + %div2.i = lshr i64 %sub.i, 2 + br label %_ZNK1B5m_fn3EPv.exit + +_ZNK1B5m_fn3EPv.exit: ; preds = %if.then.i, %if.else.i +; CHECK: _ZNK1B5m_fn3EPv.exit: +; CHECK: %cast76 = bitcast i64* %t0 to i8* +; CHECK: %uglygep77 = getelementptr i8, i8* %cast76, i32 6 +; CHECK: %load.trunc78 = load i8, i8* %uglygep77, align 2 +; CHECK: %trunc.zext79 = zext i8 %load.trunc78 to i64 +; CHECK: %cmp8 = icmp eq i64 %trunc.zext79, 4 + + %j.0.i = phi i64 [ %div.i, %if.then.i ], [ %div2.i, %if.else.i ] + %conv.i = trunc i64 %j.0.i to i16 + %bf.lshr = lshr i64 %bf.load, 48 + %bf.clear7 = and i64 %bf.lshr, 255 + %cmp8 = icmp eq i64 %bf.clear7, 4 + br i1 %cmp8, label %if.else, label %if.then9 + +if.then9: ; preds = %_ZNK1B5m_fn3EPv.exit +; CHECK: if.then9: +; CHECK: %arrayidx = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 1, i64 %trunc.zext79 +; CHECK: store i16 %conv.i, i16* %arrayidx, align 2 +; CHECK: %cast25 = bitcast i64* %t0 to i8* +; CHECK: %uglygep26 = getelementptr i8, i8* %cast25, i32 6 +; CHECK: %cast71 = bitcast i64* %t0 to i8* +; CHECK: %uglygep72 = getelementptr i8, i8* %cast71, i32 6 +; CHECK: %load.trunc73 = load i8, i8* %uglygep72, align 2 +; CHECK: %[[ADD2:.*]] = add i8 %load.trunc73, 1 +; CHECK: %[[AND1:.*]] = and i8 %[[ADD2]], -1 +; CHECK: store i8 %[[AND1]], i8* %uglygep26, align 2 + + %arrayidx = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 1, i64 %bf.clear7 + store i16 %conv.i, i16* %arrayidx, align 2 + %inc79 = add i64 %bf.set, 281474976710656 + %bf.shl = and i64 %inc79, 71776119061217280 + %bf.clear18 = and i64 %bf.set, -71776119061217281 + %bf.set19 = or i64 %bf.shl, %bf.clear18 + store i64 %bf.set19, i64* %t0, align 8 + br label %return + +if.else: ; preds = 
%_ZNK1B5m_fn3EPv.exit +; CHECK: if.else: +; CHECK: %cast66 = bitcast i64* %t0 to i8* +; CHECK: %uglygep67 = getelementptr i8, i8* %cast66, i32 4 +; CHECK: %cast68 = bitcast i8* %uglygep67 to i16* +; CHECK: %load.trunc69 = load i16, i16* %cast68, align 4 +; CHECK: %trunc.zext70 = zext i16 %load.trunc69 to i64 +; CHECK: %cmp24 = icmp eq i64 %trunc.zext70, 1 + + %bf.lshr21 = lshr i64 %bf.load, 32 + %bf.clear22 = and i64 %bf.lshr21, 65535 + %cmp24 = icmp eq i64 %bf.clear22, 1 + br i1 %cmp24, label %if.else55, label %land.lhs.true + +land.lhs.true: ; preds = %if.else +; CHECK: land.lhs.true: +; CHECK: %cast61 = bitcast i64* %t0 to i8* +; CHECK: %uglygep62 = getelementptr i8, i8* %cast61, i32 2 +; CHECK: %cast63 = bitcast i8* %uglygep62 to i16* +; CHECK: %load.trunc64 = load i16, i16* %cast63, align 2 +; CHECK: %trunc.zext65 = zext i16 %load.trunc64 to i64 +; CHECK: %cmp28 = icmp eq i64 %trunc.zext65, %sub + + %bf.lshr26 = lshr i64 %bf.load, 16 + %bf.clear27 = and i64 %bf.lshr26, 65535 + %div = lshr i64 %p2, 1 + %sub = add nsw i64 %div, -1 + %cmp28 = icmp eq i64 %bf.clear27, %sub + br i1 %cmp28, label %if.else55, label %if.then29 + +if.then29: ; preds = %land.lhs.true + %cmp30 = icmp ult i64 %p2, 3 + br i1 %cmp30, label %if.then31, label %if.else35 + +if.then31: ; preds = %if.then29 +; CHECK: if.then31: +; CHECK: %mul = shl nuw nsw i64 %trunc.zext70, 3 + + %and = and i64 %t1, -8192 + %mul = shl nuw nsw i64 %bf.clear22, 3 + %add = add i64 %mul, %and + br label %if.end41 + +if.else35: ; preds = %if.then29 +; CHECK: if.else35: +; CHECK: %cast56 = bitcast i64* %t0 to i8* +; CHECK: %uglygep57 = getelementptr i8, i8* %cast56, i32 4 +; CHECK: %cast58 = bitcast i8* %uglygep57 to i16* +; CHECK: %load.trunc59 = load i16, i16* %cast58, align 4 +; CHECK: %trunc.zext60 = zext i16 %load.trunc59 to i64 +; CHECK: %shl = shl i64 %trunc.zext60, 6 + + %first_page_.i80 = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 2 + %t4 = load i64, i64* %first_page_.i80, align 8 + %mul.i81 = shl i64 %t4, 13 + %conv.i82 = shl nuw nsw i64 %bf.lshr21, 6 + %mul3.i = and i64 %conv.i82, 4194240 + %add.i = add i64 %mul.i81, %mul3.i + br label %if.end41 + +if.end41: ; preds = %if.else35, %if.then31 +; CHECK: if.end41: +; CHECK: %cast8 = bitcast i64* %t0 to i8* +; CHECK: %uglygep9 = getelementptr i8, i8* %cast8, i32 2 +; CHECK: %cast10 = bitcast i8* %uglygep9 to i16* +; CHECK: %cast84 = bitcast i64* %t0 to i8* +; CHECK: %uglygep85 = getelementptr i8, i8* %cast84, i32 2 +; CHECK: %cast86 = bitcast i8* %uglygep85 to i16* +; CHECK: %load.trunc87 = load i16, i16* %cast86, align 2 +; CHECK: %[[ADD3:.*]] = add i16 %load.trunc87, 1 +; CHECK: %[[AND2:.*]] = and i16 %[[ADD3]], -1 +; CHECK: %cast45 = bitcast i64* %t0 to i8* +; CHECK: %uglygep46 = getelementptr i8, i8* %cast45, i32 2 +; CHECK: %cast47 = bitcast i8* %uglygep46 to i16* +; CHECK: %load.trunc48 = load i16, i16* %cast47, align 2 +; CHECK: %[[ADD4:.*]] = add i16 %load.trunc48, 1 +; CHECK: %trunc.zext49 = zext i16 %[[ADD4]] to i64 +; CHECK: store i16 %[[AND2]], i16* %cast10, align 2 +; CHECK: %arrayidx54 = getelementptr inbounds i16, i16* %t5, i64 %trunc.zext49 + + %add.i.sink = phi i64 [ %add.i, %if.else35 ], [ %add, %if.then31 ] + %t5 = inttoptr i64 %add.i.sink to i16* + %inc4578 = add i64 %bf.set, 65536 + %bf.shl48 = and i64 %inc4578, 4294901760 + %bf.clear49 = and i64 %bf.set, -4294901761 + %bf.set50 = or i64 %bf.shl48, %bf.clear49 + store i64 %bf.set50, i64* %t0, align 8 + %bf.lshr52 = lshr i64 %inc4578, 16 + %bf.clear53 = and i64 %bf.lshr52, 65535 + %arrayidx54 = 
getelementptr inbounds i16, i16* %t5, i64 %bf.clear53 + store i16 %conv.i, i16* %arrayidx54, align 2 + br label %return + +if.else55: ; preds = %land.lhs.true, %if.else +; CHECK: if.else55: +; CHECK: %cast80 = bitcast i64* %t0 to i8* +; CHECK: %uglygep81 = getelementptr i8, i8* %cast80, i32 4 +; CHECK: %cast82 = bitcast i8* %uglygep81 to i16* +; CHECK: %load.trunc83 = load i16, i16* %cast82, align 4 +; CHECK: store i16 %load.trunc83, i16* %t6, align 2 +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %lshr = lshr i64 %bf.shl63, 16 +; CHECK: %trunc = trunc i64 %lshr to i32 +; CHECK: store i32 %trunc, i32* %cast1, align 2 + + %conv59 = trunc i64 %bf.lshr21 to i16 + %t6 = bitcast i8* %p1 to i16* + store i16 %conv59, i16* %t6, align 2 + %bf.load61 = load i64, i64* %t0, align 8 + %conv60 = shl i64 %j.0.i, 32 + %bf.shl63 = and i64 %conv60, 281470681743360 + %bf.clear64 = and i64 %bf.load61, -281474976645121 + %bf.clear67 = or i64 %bf.clear64, %bf.shl63 + store i64 %bf.clear67, i64* %t0, align 8 + br label %return + +return: ; preds = %if.then9, %if.else55, %if.end41, %entry + %retval.0 = phi i1 [ false, %entry ], [ true, %if.end41 ], [ true, %if.else55 ], [ true, %if.then9 ] + ret i1 %retval.0 +} + Index: test/CodeGen/X86/store-shrink.ll =================================================================== --- test/CodeGen/X86/store-shrink.ll +++ test/CodeGen/X86/store-shrink.ll @@ -0,0 +1,363 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s +; Check bitfield store is shrinked properly in cases below. + +; class A1 { +; unsigned long f1:8; +; unsigned long f2:3; +; } a1; +; a1.f1 = n; +; +; The bitfield store can be shrinked from i16 to i8. +; CHECK-LABEL: @test1( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %t0 = trunc i64 %conv to i16 +; CHECK: %bf.value = and i16 %t0, 255 +; CHECK: %trunc = trunc i16 %bf.value to i8 +; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8 + +%class.A1 = type { i16, [6 x i8] } +@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8 + +define void @test1(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %t0 = trunc i64 %conv to i16 + %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + %bf.value = and i16 %t0, 255 + %bf.clear = and i16 %bf.load, -256 + %bf.set = or i16 %bf.clear, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + ret void +} + +; class A2 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a2; +; a2.f1 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test2( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8 + +%class.A2 = type { i24, [4 x i8] } +@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8 + +define void @test2(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.clear = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear, %bf.value + store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8 + ret void +} + +; class A3 { +; unsigned long f1:32; +; unsigned long f2:3; +; } a3; +; a3.f1 = n; +; The bitfield store can be shrinked from i64 to i32. 
+; CHECK-LABEL: @test3( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %bf.value = and i64 %conv, 4294967295 +; CHECK: %trunc = trunc i64 %bf.value to i32 +; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8 + +%class.A3 = type { i40 } +@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8 + +define void @test3(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8 + %bf.value = and i64 %conv, 4294967295 + %bf.clear = and i64 %bf.load, -4294967296 + %bf.set = or i64 %bf.clear, %bf.value + store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8 + ret void +} + +; class A4 { +; unsigned long f1:13; +; unsigned long f2:3; +; } a4; +; a4.f1 = n; +; The bitfield store cannot be shrinked because the field is not 8/16/32 bits. +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. 
+; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. +; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. 
+; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. +; CHECK-LABEL: @i24_insert_bit( +; CHECK: %extbit = zext i1 %bit to i24 +; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13 +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i24 %extbit.shl, 8 +; CHECK: %trunc = trunc i24 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i24 + %b = load i24, i24* %a, align 1 + %extbit.shl = shl nuw nsw i24 %extbit, 13 + %c = and i24 %b, -8193 + %d = or i24 %c, %extbit.shl + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_or(i56* %a) { + %aa = load i56, i56* %a, align 1 + %b = or i56 %aa, 384 + store i56 %b, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_and_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_and_or(i56* %a) { + %b = load i56, i56* %a, align 1 + %c = and i56 %b, -128 + %d = or i56 %c, 384 + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store to i8. +; CHECK-LABEL: @i56_insert_bit( +; CHECK: %extbit = zext i1 %bit to i56 +; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13 +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i56 %extbit.shl, 8 +; CHECK: %trunc = trunc i56 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i56 + %b = load i56, i56* %a, align 1 + %extbit.shl = shl nuw nsw i56 %extbit, 13 + %c = and i56 %b, -8193 + %d = or i56 %c, %extbit.shl + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. 
+; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i16* +; CHECK: %load.trunc = load i16, i16* %cast1, align 2 +; CHECK: %or.trunc = or i16 %load.trunc, 272 +; CHECK: store i16 %or.trunc, i16* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -380,6 +380,7 @@ initializeTarget(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. + initializeMemAccessShrinkingPassPass(Registry); initializeCodeGenPreparePass(Registry); initializeAtomicExpandPass(Registry); initializeRewriteSymbolsLegacyPassPass(Registry);