Index: include/llvm/Analysis/MemorySSAUpdater.h =================================================================== --- include/llvm/Analysis/MemorySSAUpdater.h +++ include/llvm/Analysis/MemorySSAUpdater.h @@ -134,6 +134,9 @@ /// load is simply erased (not replaced), removeMemoryAccess should be called /// on the MemoryAccess for that store/load. void removeMemoryAccess(MemoryAccess *); + /// If the instruction has a MemoryAccess associated with it, remove + /// the MemoryAccess from MemorySSA. + void removeMemoryAccess(Instruction *); private: // Move What before Where in the MemorySSA IR. Index: include/llvm/CodeGen/Passes.h =================================================================== --- include/llvm/CodeGen/Passes.h +++ include/llvm/CodeGen/Passes.h @@ -64,6 +64,9 @@ MachineFunctionPass *createResetMachineFunctionPass(bool EmitFallbackDiag, bool AbortOnFailedISel); + /// This pass shrinks some load/store to narrower legal type accesses. + FunctionPass *createMemAccessShrinkingPass(const TargetMachine *TM = nullptr); + /// createCodeGenPreparePass - Transform the code to expose more pattern /// matching during instruction selection. FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = nullptr); Index: include/llvm/IR/PatternMatch.h =================================================================== --- include/llvm/IR/PatternMatch.h +++ include/llvm/IR/PatternMatch.h @@ -317,6 +317,9 @@ /// \brief Match a ConstantInt, capturing the value if we match. inline bind_ty m_ConstantInt(ConstantInt *&CI) { return CI; } +/// \brief Match a load instruction, capturing the value if we match. +inline bind_ty m_Load(LoadInst *&LI) { return LI; } + /// \brief Match a Constant, capturing the value if we match. inline bind_ty m_Constant(Constant *&C) { return C; } Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -246,6 +246,7 @@ void initializeMachineSinkingPass(PassRegistry&); void initializeMachineTraceMetricsPass(PassRegistry&); void initializeMachineVerifierPassPass(PassRegistry&); +void initializeMemAccessShrinkingPassPass(PassRegistry &); void initializeMemCpyOptLegacyPassPass(PassRegistry&); void initializeMemDepPrinterPass(PassRegistry&); void initializeMemDerefPrinterPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -168,6 +168,7 @@ (void) llvm::createCountingFunctionInserterPass(); (void) llvm::createEarlyCSEPass(); (void) llvm::createGVNHoistPass(); + (void) llvm::createMemAccessShrinkingPass(); (void) llvm::createMergedLoadStoreMotionPass(); (void) llvm::createGVNPass(); (void) llvm::createNewGVNPass(); Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -1965,6 +1965,14 @@ return false; } + /// Return true if it's expensive to narrow operations of type VT1 to + /// VT2. Sometimes we want to try load/store shrinking to save extra + /// bit manipulation operations. We need to make sure the shrinking will + /// not be so expensive to offset the benefit. + virtual bool isNarrowingExpensive(EVT /*VT1*/, EVT /*VT2*/) const { + return false; + } + /// \brief Return true if it is beneficial to convert a load of a constant to /// just the constant itself. 
/// On some targets it might be more efficient to use a combination of Index: include/llvm/Transforms/Utils/Local.h =================================================================== --- include/llvm/Transforms/Utils/Local.h +++ include/llvm/Transforms/Utils/Local.h @@ -15,13 +15,14 @@ #ifndef LLVM_TRANSFORMS_UTILS_LOCAL_H #define LLVM_TRANSFORMS_UTILS_LOCAL_H +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Operator.h" -#include "llvm/ADT/SmallPtrSet.h" namespace llvm { @@ -81,8 +82,9 @@ /// If the specified value is a trivially dead instruction, delete it. /// If that makes any of its operands trivially dead, delete them too, /// recursively. Return true if any instructions were deleted. -bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, - const TargetLibraryInfo *TLI = nullptr); +bool RecursivelyDeleteTriviallyDeadInstructions( + Value *V, const TargetLibraryInfo *TLI = nullptr, + MemorySSAUpdater *MSSAUpdater = nullptr); /// If the specified value is an effectively dead PHI node, due to being a /// def-use chain of single-use nodes that either forms a cycle or is terminated Index: lib/Analysis/MemorySSAUpdater.cpp =================================================================== --- lib/Analysis/MemorySSAUpdater.cpp +++ lib/Analysis/MemorySSAUpdater.cpp @@ -464,6 +464,12 @@ MSSA->removeFromLists(MA); } +void MemorySSAUpdater::removeMemoryAccess(Instruction *I) { + MemoryAccess *MemAcc = MSSA->getMemoryAccess(I); + if (MemAcc) + removeMemoryAccess(MemAcc); +} + MemoryAccess *MemorySSAUpdater::createMemoryAccessInBB( Instruction *I, MemoryAccess *Definition, const BasicBlock *BB, MemorySSA::InsertionPlace Point) { Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -86,6 +86,7 @@ MachineSSAUpdater.cpp MachineTraceMetrics.cpp MachineVerifier.cpp + MemAccessShrinking.cpp PatchableFunction.cpp MIRPrinter.cpp MIRPrintingPass.cpp Index: lib/CodeGen/CodeGen.cpp =================================================================== --- lib/CodeGen/CodeGen.cpp +++ lib/CodeGen/CodeGen.cpp @@ -66,6 +66,7 @@ initializeMachineSchedulerPass(Registry); initializeMachineSinkingPass(Registry); initializeMachineVerifierPassPass(Registry); + initializeMemAccessShrinkingPassPass(Registry); initializeOptimizePHIsPass(Registry); initializePEIPass(Registry); initializePHIEliminationPass(Registry); Index: lib/CodeGen/MemAccessShrinking.cpp =================================================================== --- lib/CodeGen/MemAccessShrinking.cpp +++ lib/CodeGen/MemAccessShrinking.cpp @@ -0,0 +1,1189 @@ +//===- MemAccessShrinking.cpp - Shrink the type of mem accesses -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// The pass tries to find opportunities where we can save bit manipulation +/// operations or convert illegal type operations to legal type by shrinking +/// the type of memory accesses. +/// +/// A major motivation of the optimization: +/// Consecutive bitfields are wrapped as a group and represented as a +/// large integer. 
+/// To access a specific bitfield, a wide load/store plus a series of bit
+/// operations is needed. However, if the bitfield is of a legal type, it is
+/// more efficient to access it directly. Another case is when the bitfield
+/// group is of an illegal integer type: if we only access one bitfield in
+/// the group, we may have a chance to shrink the illegal type to a smaller
+/// legal type and save some bit-mask operations.
+/// Although the major motivation of the optimization is bitfield accesses,
+/// it is not limited to them and can be applied to general memory accesses
+/// with similar patterns.
+///
+/// Description of the optimization:
+/// The optimization has three basic categories. The first category is store
+/// shrinking. If we are storing a large value, with some of its consecutive
+/// bits replaced by a small value through bit manipulation, we can use two
+/// stores, one for the large value and one for the small, to replace the
+/// original store and the bit manipulation operations. Even better, if the
+/// large value comes from a load with the same address as the store, the
+/// store of the large value can be saved as well.
+///
+/// The second category is another kind of store shrinking. We look for an
+/// operation sequence like: load a large value, adjust some bits of it, and
+/// store it back to the same address as the load. The essence of the
+/// sequence is to change some bits of the large value but keep the rest of
+/// it the same. We may shrink the size of both the load and the store, as
+/// well as the bit manipulation operations in between, as long as the bits
+/// to be changed are still covered. This is especially useful when the load
+/// and store are of an illegal type: shrinking them to a smaller legal type
+/// saves many extra operations that would otherwise be generated during
+/// ISel legalization.
+///
+/// The third category is load use shrinking. If we are loading a large
+/// value and using it in a series of operations, but in the end only use a
+/// small part of the result, we can shrink the whole chain, including the
+/// load.
+///
+/// Some notes about the algorithm:
+/// * The algorithm scans the program and tries to recognize and transform the
+/// patterns above. It runs iteratively until no more changes happen.
+/// * To prevent the optimization from blocking load/store coalescing, it is
+/// invoked late in the pipeline, just before CodeGenPrepare. At this late
+/// stage, both the pattern matching and the related safety checks become
+/// more difficult, because earlier optimizations have introduced merged
+/// loads/stores and more complex control flow. That is why MemorySSA is used
+/// here: it provides scalable and precise safety checks even when we insert
+/// a smaller access into a block that is many blocks away from the original
+/// access.
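As a concrete illustration of the first category, here is the kind of rewrite
the pass performs, adapted from test2 in the patch's
test/CodeGen/ARM/store-shrink.ll (little-endian layout assumed; value names
are simplified here, and the exact names/alignment the pass emits may
differ). Before:

  %old   = load i32, i32* %p, align 4
  %new   = and i32 %n, 65535      ; the 16 bits being written (SmallVal)
  %clear = and i32 %old, -65536   ; LargeVal with the low 16 bits cleared
  %set   = or i32 %clear, %new
  store i32 %set, i32* %p, align 4

After shrinking, only a narrow store of the small value remains; the wide
load and the masking become dead because the large value was loaded from the
same address:

  %new   = and i32 %n, 65535
  %trunc = trunc i32 %new to i16
  %cast  = bitcast i32* %p to i16*
  store i16 %trunc, i16* %cast, align 4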
+//===----------------------------------------------------------------------===// + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + +#define DEBUG_TYPE "memaccessshrink" + +using namespace llvm; +using namespace llvm::PatternMatch; + +STATISTIC(NumStoreShrunkBySplit, "Number of stores shrunk by splitting"); +STATISTIC(NumStoreShrunkToLegal, "Number of stores shrunk to legal types"); +STATISTIC(NumLoadShrunk, "Number of Loads shrunk"); + +namespace { +/// Describe the bit range of a value. +struct BitRange { + unsigned Shift; + unsigned Width; + bool isDisjoint(BitRange &BR); +}; + +/// \brief The memory access shrinking pass. +class MemAccessShrinkingPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + MemAccessShrinkingPass(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TM(TM) { + initializeMemAccessShrinkingPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &Fn) override; + + StringRef getPassName() const override { return "MemAccess Shrinking"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + } + + struct StoreShrunkInfo { + StoreInst &SI; + Value *LargeVal; + Value *SmallVal; + ConstantInt *Cst; + BitRange &BR; + StoreShrunkInfo(StoreInst &SI, Value *LargeVal, Value *SmallVal, + ConstantInt *Cst, BitRange &BR) + : SI(SI), LargeVal(LargeVal), SmallVal(SmallVal), Cst(Cst), BR(BR) {} + }; + +private: + const DataLayout *DL = nullptr; + const DominatorTree *DT = nullptr; + const TargetMachine *TM = nullptr; + const TargetLowering *TLI = nullptr; + const TargetLibraryInfo *TLInfo = nullptr; + LLVMContext *Context; + + MemorySSA *MSSA; + MemorySSAWalker *MSSAWalker; + std::unique_ptr MSSAUpdater; + + // Candidate instructions to be erased if they have no other uses. + SmallPtrSet CandidatesToErase; + + // MultiUsesSeen shows a multiuse node is found on the Def-Use Chain. + bool MultiUsesSeen; + // SavedInsts shows how many instructions saved for a specific shrink so far. + // It is used to evaluate whether some instructions will be saved for the + // shrinking especially when MultiUsesSeen is true. 
+ unsigned SavedInsts; + + BitRange analyzeOrAndPattern(Value &SmallVal, ConstantInt &Cst, + unsigned BitSize, bool &DoReplacement); + BitRange analyzeBOpPattern(Value &Val, ConstantInt &Cst, unsigned BitSize); + bool extendBitRange(BitRange &BR, unsigned BitSize, unsigned Align); + bool hasClobberBetween(Instruction &From, Instruction &To); + bool needNewStore(Value &LargeVal, StoreInst &SI); + Value *createNewPtr(Value *Ptr, unsigned StOffset, Type *NewTy, + IRBuilder<> &Builder); + + bool findBRWithLegalType(StoreInst &SI, BitRange &BR); + bool isLegalToShrinkStore(LoadInst &LI, StoreInst &SI); + bool tryShrinkStoreBySplit(StoreInst &SI); + bool tryShrinkStoreToLegalTy(StoreInst &SI); + bool splitStoreIntoTwo(StoreShrunkInfo &SInfo); + bool shrinkStoreToLegalType(StoreShrunkInfo &SInfo); + bool tryShrinkLoadUse(Instruction &IN); + bool tryShrink(Function &Fn); + Value *shrinkInsts(Value *NewPtr, BitRange &BR, + SmallVectorImpl &Insts, + bool AllClobberInToBlock, unsigned ResShift, + IRBuilder<> &Builder); + bool matchInstsToShrink(Value *Val, BitRange &BR, + SmallVectorImpl &Insts); + bool hasClobberBetween(Instruction &From, Instruction &To, + const MemoryLocation &Mem, bool &AllClobberInToBlock); + void addSaveCandidate(Instruction &Inst, bool ReplaceAllUses = false); + bool isBeneficial(unsigned ResShift, SmallVectorImpl &Insts); + void markCandForDeletion(Instruction &I) { CandidatesToErase.insert(&I); } + bool removeDeadInsts(); +}; +} // namespace + +char MemAccessShrinkingPass::ID = 0; +INITIALIZE_TM_PASS_BEGIN(MemAccessShrinkingPass, "memaccessshrink", + "MemAccess Shrinking", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_TM_PASS_END(MemAccessShrinkingPass, "memaccessshrink", + "MemAccess Shrinking", false, false) + +FunctionPass *llvm::createMemAccessShrinkingPass(const TargetMachine *TM) { + return new MemAccessShrinkingPass(TM); +} + +/// Analyze pattern or(and(LargeVal, \p Cst), \p SmallVal). A common usage +/// of the pattern is to replace some consecutive bits in LargeVal with +/// nonzero bits of SmallVal, but to do the replacement as described, +/// some requirements have to be satisfied like that \p Cst has to contain +/// a consecutive run of zeros, and that all the nonzero bits of \p SmallVal +/// should be inside the range of consecutive zeros bits of \p Cst. +/// \p DoReplacement will contain the result of whether the requirements are +/// satisfied. We will also return BitRange describing the maximum range of +/// LargeVal that may be modified. +BitRange MemAccessShrinkingPass::analyzeOrAndPattern(Value &SmallVal, + ConstantInt &Cst, + unsigned BitSize, + bool &DoReplacement) { + // Cst is the mask. Analyze the pattern of mask after sext it to uint64_t. We + // will handle patterns like either 0..01..1 or 1..10..01..1 + APInt Mask = Cst.getValue().sextOrTrunc(BitSize); + assert(Mask.getBitWidth() == BitSize && "Unexpected bitwidth of Mask"); + unsigned MaskLeadOnes = Mask.countLeadingOnes(); + unsigned MaskTrailOnes = Mask.countTrailingOnes(); + unsigned MaskMidZeros = !MaskLeadOnes + ? Mask.countLeadingZeros() + : Mask.ashr(MaskTrailOnes).countTrailingZeros(); + // Replace some consecutive bits in LargeVal with all the bits of SmallVal. 
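To make the mask analysis concrete (numbers invented for illustration): for a
32-bit store with Cst == 0xFFFF00FF, the mask has MaskLeadOnes == 16,
MaskTrailOnes == 8 and MaskMidZeros == 8, so BR is {Shift = 8, Width = 8};
the replacement form is only usable if SmallVal is known to be zero outside
bits [8, 16), e.g. SmallVal == and(%y, 0xFF00). Otherwise DoReplacement is
cleared and BR is widened below to cover every bit that might change.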
+ DoReplacement = true; + if (MaskLeadOnes == BitSize) { + MaskMidZeros = 0; + DoReplacement = false; + } else if (MaskLeadOnes + MaskMidZeros + MaskTrailOnes != BitSize) { + // See if we have a continuous run of zeros. + MaskMidZeros = BitSize - MaskLeadOnes - MaskTrailOnes; + DoReplacement = false; + } + + // Check SmallVal only provides nonzero bits within range from lowbits + // (MaskTrailOnes) to highbits (MaskTrailOnes + MaskMidZeros). + APInt BitMask = + ~APInt::getBitsSet(BitSize, MaskTrailOnes, MaskTrailOnes + MaskMidZeros); + + // Find out the range in which 1 appears in SmallVal. + APInt KnownOne(BitSize, 0), KnownZero(BitSize, 0); + computeKnownBits(&SmallVal, KnownZero, KnownOne, *DL, 0); + + BitRange BR; + BR.Shift = MaskTrailOnes; + BR.Width = MaskMidZeros; + // Expect the bits being 1 in BitMask are all KnownZero bits in SmallVal, + // otherwise we need to set ShrinkWithMaskedVal to false and expand BR. + if ((KnownZero & BitMask) != BitMask) { + DoReplacement = false; + // Lower is the first bit for which we are not sure about the fact of + // its being zero. + unsigned Lower = KnownZero.countTrailingOnes(); + // Higher is the last bit for which we are not sure about the fact of + // its being zero. + unsigned Higher = BitSize - KnownZero.countLeadingOnes(); + BR.Shift = std::min(Lower, MaskTrailOnes); + BR.Width = std::max(Higher, MaskTrailOnes + MaskMidZeros) - BR.Shift; + } + return BR; +} + +/// Analyze pattern or/xor/and(load P, \p Cst). The result of the pattern +/// is a partially changed load P. The func will return the the range +/// showing where the load result are changed. +BitRange MemAccessShrinkingPass::analyzeBOpPattern(Value &Val, ConstantInt &Cst, + unsigned BitSize) { + APInt Mask = Cst.getValue().sextOrTrunc(BitSize); + BinaryOperator *BOP = cast(&Val); + if (BOP->getOpcode() == Instruction::And) + Mask = ~Mask; + + BitRange BR; + BR.Shift = Mask.countTrailingZeros(); + BR.Width = Mask.getBitWidth() - BR.Shift; + if (BR.Width) + BR.Width = BR.Width - Mask.countLeadingZeros(); + return BR; +} + +/// Extend \p BR so the new BR.Width bits can form a legal type. +bool MemAccessShrinkingPass::extendBitRange(BitRange &BR, unsigned BitSize, + unsigned Align) { + BitRange NewBR; + NewBR.Width = PowerOf2Ceil(BR.Width); + Type *NewTy = Type::getIntNTy(*Context, NewBR.Width); + Type *StoreTy = Type::getIntNTy(*Context, PowerOf2Ceil(BitSize)); + + // Check if we can find a new Shift for the Width of NewBR, so that + // NewBR forms a new range covering the old modified range without + // worsening alignment. + auto coverOldRange = [&](BitRange &NewBR, BitRange &BR) -> bool { + unsigned MAlign = MinAlign(Align, DL->getABITypeAlignment(NewTy)); + int Shift = BR.Shift - BR.Shift % (MAlign * 8); + while (Shift >= 0) { + if (NewBR.Width + Shift <= BitSize && + NewBR.Width + Shift >= BR.Width + BR.Shift) { + NewBR.Shift = Shift; + return true; + } + Shift -= MAlign * 8; + } + return false; + }; + // See whether we can store NewTy legally. + auto isStoreLegalType = [&](Type *NewTy) -> bool { + EVT OldEVT = TLI->getValueType(*DL, StoreTy); + EVT NewEVT = TLI->getValueType(*DL, NewTy); + return TLI->isOperationLegalOrCustom(ISD::STORE, NewEVT) || + TLI->isTruncStoreLegalOrCustom(OldEVT, NewEVT); + }; + + // Try to find the minimal NewBR.Width which can form a legal type and cover + // all the old modified bits. 
+ while (NewBR.Width < BitSize && + (!isStoreLegalType(NewTy) || + TLI->isNarrowingExpensive(EVT::getEVT(StoreTy), EVT::getEVT(NewTy)) || + !coverOldRange(NewBR, BR))) { + NewBR.Width = NextPowerOf2(NewBR.Width); + NewTy = Type::getIntNTy(*Context, NewBR.Width); + } + BR.Width = NewBR.Width; + BR.Shift = NewBR.Shift; + + if (BR.Width >= BitSize) + return false; + return true; +} + +/// Compute the offset used to compute the new ptr address. It will be +/// mainly based on BR and the endian of the target. +static unsigned computeBitOffset(BitRange &BR, unsigned BitSize, + const DataLayout &DL) { + unsigned BitOffset; + if (DL.isBigEndian()) + BitOffset = BitSize - BR.Shift - BR.Width; + else + BitOffset = BR.Shift; + + return BitOffset; +} + +static unsigned computeByteOffset(BitRange &BR, unsigned BitSize, + const DataLayout &DL) { + return computeBitOffset(BR, BitSize, DL) / 8; +} + +/// Check whether V1 and V2 has the same ptr value by looking through bitcast. +static bool hasSamePtr(Value *V1, Value *V2) { + Value *NV1 = nullptr; + Value *NV2 = nullptr; + if (V1 == V2) + return true; + if (BitCastInst *BC1 = dyn_cast(V1)) + NV1 = BC1->getOperand(0); + if (BitCastInst *BC2 = dyn_cast(V2)) + NV2 = BC2->getOperand(0); + if (!NV1 && !NV2) + return false; + if (V1 == NV2 || V2 == NV1 || NV1 == NV2) + return true; + return false; +} + +/// Check whether there is any clobber instruction between \p From and \p To +/// for the memory location accessed by \p To. +bool MemAccessShrinkingPass::hasClobberBetween(Instruction &From, + Instruction &To) { + assert(MSSA->getMemoryAccess(&To) && "Expect To has valid MemoryAccess"); + MemoryAccess *FromAccess = MSSA->getMemoryAccess(&From); + assert(FromAccess && "Expect From has valid MemoryAccess"); + MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(&To); + if (FromAccess != DefiningAccess && + MSSA->dominates(FromAccess, DefiningAccess)) + return true; + return false; +} + +/// It is possible that we already have a store of LargeVal and it is not +/// clobbered, then we can use it and don't have to generate a a new store. +bool MemAccessShrinkingPass::needNewStore(Value &LargeVal, StoreInst &SI) { + for (User *U : LargeVal.users()) { + StoreInst *PrevSI = dyn_cast(U); + if (!PrevSI || + !hasSamePtr(PrevSI->getPointerOperand(), SI.getPointerOperand()) || + PrevSI->getValueOperand()->getType() != SI.getValueOperand()->getType()) + continue; + if (!hasClobberBetween(*PrevSI, SI)) + return false; + } + return true; +} + +/// Create new address to be used by shrunk load/store based on original +/// address \p Ptr, offset \p StOffset and the new type \p NewTy. +Value *MemAccessShrinkingPass::createNewPtr(Value *Ptr, unsigned StOffset, + Type *NewTy, IRBuilder<> &Builder) { + Value *NewPtr = Ptr; + unsigned AS = cast(Ptr->getType())->getAddressSpace(); + if (StOffset) { + ConstantInt *Idx = ConstantInt::get(Type::getInt32Ty(*Context), StOffset); + NewPtr = + Builder.CreateBitCast(Ptr, Type::getInt8PtrTy(*Context, AS), "cast"); + NewPtr = + Builder.CreateGEP(Type::getInt8Ty(*Context), NewPtr, Idx, "uglygep"); + } + return Builder.CreateBitCast(NewPtr, NewTy->getPointerTo(AS), "cast"); +} + +/// Check if it is legal to shrink the store if the input LargeVal is a +/// LoadInst. +bool MemAccessShrinkingPass::isLegalToShrinkStore(LoadInst &LI, StoreInst &SI) { + Value *Ptr = SI.getOperand(1); + // LI should have the same address as SI. 
+ if (!hasSamePtr(LI.getPointerOperand(), Ptr)) + return false; + if (LI.isVolatile() || !LI.isUnordered()) + return false; + // Make sure the memory location of LI/SI is not clobbered between them. + if (hasClobberBetween(LI, SI)) + return false; + return true; +} + +/// we try to handle is when a smaller value is "inserted" into some bits +/// of a larger value. This typically looks something like: +/// +/// store(or(and(LargeVal, MaskConstant), SmallVal)), address) +/// +/// Here, we try to use a narrow store of `SmallVal` rather than bit +/// operations to combine them: +/// +/// store(LargeVal, address) +/// store(SmallVal, address) +/// +/// Or if `LargeVal` was a load, we may not need to store it at all: +/// +/// store(SmallVal, address) +/// +/// We may also be able incorporate shifts of `SmallVal` by using an offset +/// of `address`. +/// +/// However, this may not be valid if, for example, the size of `SmallVal` +/// isn't a valid type to store or the shift can't be modeled as a valid +/// offset from the address, then we can still try another transformation +/// which is to shrink the store to smaller legal width when the original +/// width of the store is illegal. +bool MemAccessShrinkingPass::tryShrinkStoreBySplit(StoreInst &SI) { + Value *Val = SI.getOperand(0); + Type *StoreTy = Val->getType(); + if (StoreTy->isVectorTy() || !StoreTy->isIntegerTy()) + return false; + if (SI.isVolatile() || !SI.isUnordered()) + return false; + unsigned BitSize = DL->getTypeSizeInBits(StoreTy); + if (BitSize != DL->getTypeStoreSizeInBits(StoreTy)) + return false; + + Value *LargeVal; + Value *SmallVal; + ConstantInt *Cst; + if (!(match(Val, m_Or(m_And(m_Value(LargeVal), m_ConstantInt(Cst)), + m_Value(SmallVal))) && + match(LargeVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))) && + !(match(Val, m_Or(m_Value(SmallVal), + m_And(m_Value(LargeVal), m_ConstantInt(Cst)))) && + match(LargeVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))) && + !match(Val, m_c_Or(m_And(m_Value(LargeVal), m_ConstantInt(Cst)), + m_Value(SmallVal)))) + return false; + + LoadInst *LI = dyn_cast(LargeVal); + if (LI && !isLegalToShrinkStore(*LI, SI)) + return false; + + // Return BR which indicates the range of the LargeVal that will actually + // be modified. If the pattern does the work to replace portion of LargeVal + // with bits from SmallVal, we can do the split store transformation. + // Otherwise, we can only try shrink the store to legal type transformation. + // ToSplitStore will be populated to reflect that. + bool ToSplitStore; + BitRange BR = analyzeOrAndPattern(*SmallVal, *Cst, BitSize, ToSplitStore); + assert(BR.Shift + BR.Width <= BitSize && "Unexpected BitRange!"); + if (!BR.Width) + return false; + + if (ToSplitStore) { + // Get the offset from Ptr for the shrunk store. + unsigned StoreBitOffset = computeBitOffset(BR, BitSize, *DL); + if (StoreBitOffset % 8 != 0) + ToSplitStore = false; + + // If BR.Width is not the length of legal integer type, we cannot + // store SmallVal directly. 
+ if (BR.Width != 8 && BR.Width != 16 && BR.Width != 32 && BR.Width != 64) + ToSplitStore = false; + } + + if (!ToSplitStore) { + if (!LI) + return false; + if (!findBRWithLegalType(SI, BR)) + return false; + StoreShrunkInfo SInfo(SI, LI, SmallVal, Cst, BR); + shrinkStoreToLegalType(SInfo); + return true; + } + IntegerType *NewTy = Type::getIntNTy(*Context, BR.Width); + if (TLI->isNarrowingExpensive(EVT::getEVT(StoreTy), EVT::getEVT(NewTy))) + return false; + + StoreShrunkInfo SInfo(SI, LargeVal, SmallVal, Cst, BR); + splitStoreIntoTwo(SInfo); + return true; +} + +/// The pattern we try to shrink (which may also apply to code matching +/// the tryShrinkStoreBySplit pattern when that transformation isn't valid) +/// is to shrink the inputs of basic bit operations (or, and, xor) and then +/// store the smaller width result. This is valid whenever we know that +/// shrinking the inputs and operations doesn't change the result. For example, +/// if the removed bits are known to be 0s, 1s, or undef, we may be able to +/// avoid the bitwise computation on them. This is particularly useful when +/// the type width is not a legal type, and when the inputs are loads and +/// constants. +bool MemAccessShrinkingPass::tryShrinkStoreToLegalTy(StoreInst &SI) { + Value *Val = SI.getOperand(0); + Type *StoreTy = Val->getType(); + if (StoreTy->isVectorTy() || !StoreTy->isIntegerTy()) + return false; + if (SI.isVolatile() || !SI.isUnordered()) + return false; + unsigned BitSize = DL->getTypeSizeInBits(StoreTy); + if (BitSize != DL->getTypeStoreSizeInBits(StoreTy)) + return false; + + LoadInst *LI; + ConstantInt *Cst; + if (!match(Val, m_c_Or(m_Load(LI), m_ConstantInt(Cst))) && + !match(Val, m_c_And(m_Load(LI), m_ConstantInt(Cst))) && + !match(Val, m_c_Xor(m_Load(LI), m_ConstantInt(Cst)))) + return false; + + if (!isLegalToShrinkStore(*LI, SI)) + return false; + + // Find out the range of Val which is changed. + BitRange BR = analyzeBOpPattern(*Val, *Cst, BitSize); + assert(BR.Shift + BR.Width <= BitSize && "Unexpected BitRange!"); + if (!BR.Width) + return false; + if (!findBRWithLegalType(SI, BR)) + return false; + + StoreShrunkInfo SInfo(SI, LI, nullptr, Cst, BR); + shrinkStoreToLegalType(SInfo); + return true; +} + +/// Try to extend the existing BitRange \BR to legal integer width. +bool MemAccessShrinkingPass::findBRWithLegalType(StoreInst &SI, BitRange &BR) { + Type *StoreTy = SI.getOperand(0)->getType(); + unsigned Align = SI.getAlignment(); + unsigned BitSize = DL->getTypeSizeInBits(StoreTy); + // Check whether the store is of illegal type. If it is not, don't bother. + if (!TLI || TLI->isOperationLegalOrCustom(ISD::STORE, + TLI->getValueType(*DL, StoreTy))) + return false; + // Try to find out a BR of legal type. + if (!extendBitRange(BR, BitSize, Align)) + return false; + return true; +} + +/// The transformation to split the store into two: one is for the LargeVal +/// and one is for the SmallVal. The first store can be saved if the LargeVal +/// is got from a load or there is an existing store for the LargeVal. 
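For illustration, when LargeVal is neither a load nor already stored to the
same address, the rewrite looks roughly like this (a hand-written sketch with
invented names, little-endian layout assumed; the pass's actual output may
differ in details). Before:

  %low     = and i32 %n, 255        ; SmallVal, known zero above bit 8
  %cleared = and i32 %big, -256     ; LargeVal with the low byte cleared
  %merged  = or i32 %cleared, %low
  store i32 %merged, i32* %p, align 4

After, the wide store of LargeVal is kept and a narrow store of SmallVal
overwrites just the low byte:

  store i32 %big, i32* %p, align 4
  %trunc = trunc i32 %low to i8
  %cast  = bitcast i32* %p to i8*
  store i8 %trunc, i8* %cast, align 4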
+bool MemAccessShrinkingPass::splitStoreIntoTwo(StoreShrunkInfo &SInfo) { + BitRange &BR = SInfo.BR; + StoreInst &SI = SInfo.SI; + IRBuilder<> Builder(*Context); + Builder.SetInsertPoint(&SI); + Value *Val = SI.getOperand(0); + Value *Ptr = SI.getOperand(1); + + Type *StoreTy = SI.getOperand(0)->getType(); + unsigned BitSize = DL->getTypeSizeInBits(StoreTy); + unsigned StoreByteOffset = computeByteOffset(BR, BitSize, *DL); + unsigned Align = SI.getAlignment(); + if (StoreByteOffset) + Align = MinAlign(StoreByteOffset, Align); + + IntegerType *NewTy = Type::getIntNTy(*Context, BR.Width); + Value *NewPtr = createNewPtr(Ptr, StoreByteOffset, NewTy, Builder); + + // Check if we need to split the original store and generate a new + // store for the LargeVal. + if (!isa(SInfo.LargeVal) && needNewStore(*SInfo.LargeVal, SI)) { + StoreInst *NewSI = cast(SI.clone()); + NewSI->setOperand(0, SInfo.LargeVal); + NewSI->setOperand(1, Ptr); + Builder.Insert(NewSI); + DEBUG(dbgs() << "MemShrink: Insert" << *NewSI << " before" << SI << "\n"); + // MemorySSA update for the new store. + MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MemoryDef *NewMemAcc = cast( + MSSAUpdater->createMemoryAccessBefore(NewSI, Def, OldMemAcc)); + MSSAUpdater->insertDef(NewMemAcc, false); + } + + // Create the New Value to store. + Value *SmallVal = nullptr; + if (auto MVCst = dyn_cast(SInfo.SmallVal)) { + APInt ModifiedCst = MVCst->getValue().lshr(BR.Shift).trunc(BR.Width); + SmallVal = ConstantInt::get(*Context, ModifiedCst); + } else { + Value *ShiftedVal = + BR.Shift ? Builder.CreateLShr(SInfo.SmallVal, BR.Shift, "lshr") + : SInfo.SmallVal; + SmallVal = Builder.CreateTruncOrBitCast(ShiftedVal, NewTy, "trunc"); + } + + // Create the new store and remove the old one. + StoreInst *NewSI = cast(SI.clone()); + NewSI->setOperand(0, SmallVal); + NewSI->setOperand(1, NewPtr); + NewSI->setAlignment(Align); + Builder.Insert(NewSI); + + DEBUG(dbgs() << "MemShrink: Replace" << SI << " with" << *NewSI << "\n"); + // MemorySSA update for the shrunk store. + MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MemoryAccess *NewMemAcc = + MSSAUpdater->createMemoryAccessBefore(NewSI, Def, OldMemAcc); + OldMemAcc->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(OldMemAcc); + + markCandForDeletion(*cast(Val)); + SI.eraseFromParent(); + NumStoreShrunkBySplit++; + return true; +} + +/// The transformation to shrink the value of the store to smaller width. +bool MemAccessShrinkingPass::shrinkStoreToLegalType(StoreShrunkInfo &SInfo) { + BitRange &BR = SInfo.BR; + StoreInst &SI = SInfo.SI; + IRBuilder<> Builder(*Context); + Builder.SetInsertPoint(&SI); + Value *Val = SI.getOperand(0); + Value *Ptr = SI.getOperand(1); + + Type *StoreTy = SI.getOperand(0)->getType(); + unsigned BitSize = DL->getTypeSizeInBits(StoreTy); + unsigned StoreByteOffset = computeByteOffset(BR, BitSize, *DL); + unsigned Align = SI.getAlignment(); + if (StoreByteOffset) + Align = MinAlign(StoreByteOffset, Align); + + IntegerType *NewTy = Type::getIntNTy(*Context, BR.Width); + Value *NewPtr = createNewPtr(Ptr, StoreByteOffset, NewTy, Builder); + + LoadInst *NewLI = cast(cast(SInfo.LargeVal)->clone()); + NewLI->mutateType(NewTy); + NewLI->setOperand(0, NewPtr); + NewLI->setAlignment(Align); + Builder.Insert(NewLI, "load.trunc"); + // MemorySSA update for the shrunk load. 
+ MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MSSAUpdater->createMemoryAccessBefore(NewLI, Def, OldMemAcc); + + // Create the SmallVal to store. + Value *SmallVal = nullptr; + APInt ModifiedCst = SInfo.Cst->getValue().lshr(BR.Shift).trunc(BR.Width); + ConstantInt *NewCst = ConstantInt::get(*Context, ModifiedCst); + if (SInfo.SmallVal) { + // Use SInfo.SmallVal to get the SmallVal. + Value *Trunc; + if (auto MVCst = dyn_cast(SInfo.SmallVal)) { + ModifiedCst = MVCst->getValue().lshr(BR.Shift).trunc(BR.Width); + Trunc = ConstantInt::get(*Context, ModifiedCst); + } else { + Value *ShiftedVal = + BR.Shift ? Builder.CreateLShr(SInfo.SmallVal, BR.Shift, "lshr") + : SInfo.SmallVal; + Trunc = Builder.CreateTruncOrBitCast(ShiftedVal, NewTy, "trunc"); + } + Value *NewAnd = Builder.CreateAnd(NewLI, NewCst, "and.trunc"); + SmallVal = Builder.CreateOr(NewAnd, Trunc, "or.trunc"); + } else { + BinaryOperator *BOP = cast(Val); + switch (BOP->getOpcode()) { + default: + llvm_unreachable("BOP can only be And/Or/Xor"); + case Instruction::And: + SmallVal = Builder.CreateAnd(NewLI, NewCst, "and.trunc"); + break; + case Instruction::Or: + SmallVal = Builder.CreateOr(NewLI, NewCst, "or.trunc"); + break; + case Instruction::Xor: + SmallVal = Builder.CreateXor(NewLI, NewCst, "xor.trunc"); + break; + } + } + + // Create the new store and remove the old one. + StoreInst *NewSI = cast(SI.clone()); + NewSI->setOperand(0, SmallVal); + NewSI->setOperand(1, NewPtr); + NewSI->setAlignment(Align); + Builder.Insert(NewSI); + + DEBUG(dbgs() << "MemShrink: Replace" << SI << " with" << *NewSI << "\n"); + // MemorySSA update for the shrunk store. + OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + Def = OldMemAcc->getDefiningAccess(); + MemoryAccess *NewMemAcc = + MSSAUpdater->createMemoryAccessBefore(NewSI, Def, OldMemAcc); + OldMemAcc->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(OldMemAcc); + + markCandForDeletion(*cast(Val)); + SI.eraseFromParent(); + NumStoreShrunkToLegal++; + return true; +} + +/// AI contains a consecutive run of 1. Check the range \p BR of \p AI are all +/// 1s. +static bool isRangeofAIAllOnes(BitRange &BR, APInt &AI) { + if (BR.Shift < AI.countTrailingZeros() || + BR.Width + BR.Shift > (AI.getBitWidth() - AI.countLeadingZeros())) + return false; + return true; +} + +bool BitRange::isDisjoint(BitRange &BR) { + return (Shift > BR.Shift + BR.Width - 1) || (BR.Shift > Shift + Width - 1); +} + +/// Check if there is no instruction between \p From and \p To which may +/// clobber the MemoryLocation \p Mem. However, if there are clobbers and +/// all the clobber instructions between \p From and \p To are in the same +/// block as \p To, We will set \p AllClobberInToBlock to true. +bool MemAccessShrinkingPass::hasClobberBetween(Instruction &From, + Instruction &To, + const MemoryLocation &Mem, + bool &AllClobberInToBlock) { + assert(DT->dominates(&From, &To) && "From doesn't dominate To"); + const BasicBlock *FBB = From.getParent(); + const BasicBlock *TBB = To.getParent(); + // Collect BasicBlocks to scan between FBB and TBB into BBSet. + SmallPtrSet BBSet; + SmallVector Worklist; + BBSet.insert(FBB); + BBSet.insert(TBB); + Worklist.push_back(TBB); + do { + const BasicBlock *BB = Worklist.pop_back_val(); + for (const BasicBlock *Pred : predecessors(BB)) { + if (!BBSet.count(Pred)) { + BBSet.insert(Pred); + Worklist.push_back(Pred); + } + } + } while (!Worklist.empty()); + + // Check the DefsList inside of each BB. 
+ bool hasClobber = false; + AllClobberInToBlock = true; + for (const BasicBlock *BB : BBSet) { + const MemorySSA::DefsList *DList = MSSA->getBlockDefs(BB); + if (!DList) + continue; + + for (const MemoryAccess &MA : *DList) { + if (const MemoryDef *MD = dyn_cast(&MA)) { + Instruction *Inst = MD->getMemoryInst(); + if ((FBB == Inst->getParent() && DT->dominates(Inst, &From)) || + (TBB == Inst->getParent() && DT->dominates(&To, Inst))) + continue; + // Check whether MD is a clobber of Mem. + if (MSSAWalker->getClobberingMemoryAccess(const_cast(MD), + Mem) == MD) { + if (Inst->getParent() != TBB) + AllClobberInToBlock = false; + hasClobber = true; + } + } + } + } + if (!hasClobber) + AllClobberInToBlock = false; + return hasClobber; +} + +/// Match \p Val to the pattern: +/// Bop(...Bop(V, Cst_1), ..., Cst_N) and pattern: +/// or(and(...or(and(LI, Cst_1), SmallVal_1), ..., Cse_N), SmallVal_N), +/// and find the final load to be shrunk. +/// All the Bop instructions in the first pattern and the final load will +/// be added into \p Insts and will be shrunk afterwards. +bool MemAccessShrinkingPass::matchInstsToShrink( + Value *Val, BitRange &BR, SmallVectorImpl &Insts) { + unsigned BitSize = DL->getTypeSizeInBits(Val->getType()); + + // Match the pattern: + // Bop(...Bop(V, Cst_1), Cst_2, ..., Cst_N), Bop can be Add/Sub/And/Or/Xor. + // Those BinaryOperator instructions are easy to be shrunk to BR range. + Value *Opd; + ConstantInt *Cst; + while (match(Val, m_Add(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Sub(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_And(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Or(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Xor(m_Value(Opd), m_ConstantInt(Cst)))) { + Instruction *BOP = cast(Val); + addSaveCandidate(*BOP); + Insts.push_back(BOP); + Val = Opd; + } + + LoadInst *LI; + if ((LI = dyn_cast(Val))) { + addSaveCandidate(*LI); + Insts.push_back(LI); + return true; + } + + // Match the pattern: + // or(and(...or(and(LI, Cst_1), SmallVal_1), ..., Cse_N), SmallVal_N). + // Analyze the range of LargeVal which may be modified by the current + // or(and(LargeVal, Cst), SmallVal)) operations. If each or(and(...)) + // pair modifies a BitRange which is disjoint with current BR, we can + // skip all these operations when we shrink the final load, so we only + // have to add the final load into Insts. + Value *LargeVal; + Value *SmallVal; + BitRange OtherBR; + while (match(Val, m_c_Or(m_And(m_Load(LI), m_ConstantInt(Cst)), + m_Value(SmallVal))) || + (match(Val, m_Or(m_And(m_Value(LargeVal), m_ConstantInt(Cst)), + m_Value(SmallVal))) && + match(LargeVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))) || + (match(Val, m_Or(m_Value(SmallVal), + m_And(m_Value(LargeVal), m_ConstantInt(Cst)))) && + match(LargeVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value())))) { + addSaveCandidate(*cast(Val)); + bool DoReplacement; + OtherBR.Shift = 0; + OtherBR.Width = BitSize; + // Analyze BitRange of current or(and(LargeVal, Cst), SmallVal) operations. + OtherBR = analyzeOrAndPattern(*SmallVal, *Cst, BitSize, DoReplacement); + // If OtherBR is not disjoint with BR, we cannot shrink load to its BR + // portion. + if (!BR.isDisjoint(OtherBR)) + return false; + if (LI) { + Insts.push_back(LI); + return true; + } + Val = LargeVal; + } + + return false; +} + +/// Find the next MemoryAccess after LI in the same BB. 
+MemoryAccess *findNextMemoryAccess(LoadInst &LI, MemorySSA &MSSA) { + for (auto Scan = LI.getIterator(); Scan != LI.getParent()->end(); ++Scan) { + if (MemoryAccess *MA = MSSA.getMemoryAccess(&*Scan)) + return MA; + } + return nullptr; +} + +/// Shrink \p Insts according to the range BR. +Value *MemAccessShrinkingPass::shrinkInsts( + Value *NewPtr, BitRange &BR, SmallVectorImpl &Insts, + bool AllClobberInToBlock, unsigned ResShift, IRBuilder<> &Builder) { + Value *NewVal; + IntegerType *NewTy = Type::getIntNTy(*Context, BR.Width); + Instruction &InsertPt = *Builder.GetInsertPoint(); + for (Instruction *I : make_range(Insts.rbegin(), Insts.rend())) { + if (LoadInst *LI = dyn_cast(I)) { + unsigned BitSize = DL->getTypeSizeInBits(LI->getType()); + unsigned LoadByteOffset = computeByteOffset(BR, BitSize, *DL); + unsigned Align = MinAlign(LoadByteOffset, LI->getAlignment()); + Instruction &NewInsertPt = *(InsertPt.getParent()->getFirstInsertionPt()); + if (AllClobberInToBlock && (&InsertPt != &NewInsertPt)) { + // If we use a new insertion point, we have to recreate the NewPtr in + // the new place. + Builder.SetInsertPoint(&NewInsertPt); + RecursivelyDeleteTriviallyDeadInstructions(NewPtr); + NewPtr = createNewPtr(LI->getPointerOperand(), LoadByteOffset, NewTy, + Builder); + } + // Create shrunk load. + LoadInst *NewLI = cast(LI->clone()); + NewLI->setOperand(0, NewPtr); + NewLI->mutateType(NewTy); + NewLI->setAlignment(Align); + Builder.Insert(NewLI, "load.trunc"); + NewVal = NewLI; + // Update MemorySSA. + MemoryUseOrDef *OldMemAcc = + cast(MSSA->getMemoryAccess(LI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + // Find a proper position to insert the new MemoryAccess. + MemoryAccess *Next = findNextMemoryAccess(*NewLI, *MSSA); + if (Next) + MSSAUpdater->createMemoryAccessBefore(NewLI, Def, + cast(Next)); + else + MSSAUpdater->createMemoryAccessInBB(NewLI, Def, NewLI->getParent(), + MemorySSA::End); + } else { + // shrink Add/Sub/And/Xor/Or in Insts. + BinaryOperator *BOP = cast(I); + ConstantInt *Cst = cast(BOP->getOperand(1)); + APInt NewAPInt = Cst->getValue().extractBits(BR.Width, BR.Shift); + ConstantInt *NewCst = + ConstantInt::get(*Context, NewAPInt.zextOrTrunc(BR.Width)); + NewVal = Builder.CreateBinOp(BOP->getOpcode(), NewVal, NewCst); + } + } + // Adjust the type to be consistent with the input instruction. + IntegerType *UseType = cast(InsertPt.getType()); + if (DL->getTypeSizeInBits(UseType) != BR.Width) + NewVal = Builder.CreateZExt(NewVal, UseType, "trunc.zext"); + // Adjust the NewVal with ResShift. + if (ResShift) { + ConstantInt *NewCst = ConstantInt::get(UseType, ResShift); + NewVal = Builder.CreateShl(NewVal, NewCst, "shl"); + } + return NewVal; +} + +/// Determine whether \p Inst can be counted as a saved instruction when we +/// are evaluate the benefit. If \p ReplaceAllUses is true, it means we are +/// going to replace all the uses of \p Inst with the shrunk result, so it +/// can still be counted as a saved instruction even if it has multiple uses. +void MemAccessShrinkingPass::addSaveCandidate(Instruction &Inst, + bool ReplaceAllUses) { + if (!MultiUsesSeen) { + // If the Inst has multiple uses and the current shrinking cannot replace + // all its uses, it will not regarded as a SavedInst. + if (ReplaceAllUses || Inst.hasOneUse()) + SavedInsts += (!isa(Inst) && !isa(Inst)); + else + MultiUsesSeen = true; + } +} + +/// Compare \p SavedInsts with instructions we are about to create and decide +/// whether it is beneficial to do the shrinking. 
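The unprofitable case is exercised by @load_trunc_multiuses in the patch's
test/CodeGen/ARM/load-shrink.ll: the wide load below has a second user, so
shrinking the truncated use would add a narrow load without making anything
dead, and the transformation is rejected (IR excerpted from that test):

  %bf.load  = load i64, i64* %ptr, align 8
  %bf.trunc = trunc i64 %bf.load to i16
  %cmp1     = icmp eq i16 %bf.trunc, 3
  %cmp2     = icmp ult i64 %bf.load, 1500000
  %cmp      = and i1 %cmp1, %cmp2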
+bool MemAccessShrinkingPass::isBeneficial( + unsigned ResShift, SmallVectorImpl &Insts) { + unsigned InstsToCreate = 0; + if (ResShift) + InstsToCreate++; + InstsToCreate += Insts.size(); + return SavedInsts >= InstsToCreate; +} + +/// When the input instruction \p IN is and(Val, Cst) or trunc, it indicates +/// only a portion of its input value has been used. We will walk through the +/// Def-Use chain, track the range of value which will be used, remember the +/// operations contributing to the used value range, and skip operations which +/// changes value range that is not to be used, until a load is found. +/// +/// So we start from and or trunc operations, then try to find a sequence of +/// shl or lshr operations. Those shifts are only changing the valid range +/// of input value. +/// +/// After we know the valid range of input value, we will collect the sequence +/// of binary operators, which we want to shrink their input to the same range. +/// To make it simpler, we requires one of the operands of any binary operator +/// has to be constant. +/// +/// Then we look for the pattern of or(and(LargeVal, Cst), SmallVal), which +/// will only change a portion of the input LargeVal and keep the rest of it +/// the same. If the operations pattern doesn't change the valid range, we can +/// use LargeVal as input and skip all the operations here. +/// +/// In the end, we look for a load which can provide the valid range directly +/// after shifting the address and shrinking the size of the load. +/// +/// An example: +/// and(lshr(add(or(and(load P, -65536), SmallVal), 0x1000000000000), 48), +/// 255) +/// +/// The actual valid range of the load is [48, 56). The value in the range is +/// incremented by 1. The sequence above will be transformed to: +/// zext(add(load_i8(P + 48), 1), 64) +/// +bool MemAccessShrinkingPass::tryShrinkLoadUse(Instruction &IN) { + // If the instruction is actually dead, skip it. + if (IN.use_empty()) + return false; + + Type *Ty = IN.getType(); + if (Ty->isVectorTy() || !Ty->isIntegerTy()) + return false; + + MultiUsesSeen = false; + SavedInsts = 0; + + // Match and(Val, Cst) or Trunc(Val) + Value *Val; + ConstantInt *AndCst = nullptr; + if (!match(&IN, m_Trunc(m_Value(Val))) && + !match(&IN, m_And(m_Value(Val), m_ConstantInt(AndCst)))) + return false; + addSaveCandidate(IN, true); + + // Get the initial BR showing the range of Val to be used. + BitRange BR; + BR.Shift = 0; + unsigned ResShift = 0; + unsigned BitSize = DL->getTypeSizeInBits(Val->getType()); + if (BitSize % 8 != 0) + return false; + if (AndCst) { + APInt AI = AndCst->getValue(); + // Check AI has consecutive one bits. The consecutive bits are the + // range to be used. If the num of trailing zeros are not zero, + // remember the num in ResShift and the val after shrinking needs + // to be shifted accordingly. + BR.Shift = AI.countTrailingZeros(); + ResShift = BR.Shift; + if (!(AI.lshr(BR.Shift) + 1).isPowerOf2() || BR.Shift == BitSize) + return false; + BR.Width = AI.getActiveBits() - BR.Shift; + } else { + BR.Width = DL->getTypeSizeInBits(Ty); + } + + // Match a series of LShr or Shl. Adjust BR.Shift accordingly. + // Note we have to be careful that valid bits may be cleared during + // back-and-force shifts. We use an all-one APInt Mask to simulate + // the shifts and track the valid bits. + Value *Opd; + unsigned NewShift = BR.Shift; + APInt Mask(BitSize, -1); + ConstantInt *Cst; + // Record the series of LShr or Shl in ShiftRecs. 
+ SmallVector, 4> ShiftRecs; + bool isLShr; + while ((isLShr = match(Val, m_LShr(m_Value(Opd), m_ConstantInt(Cst)))) || + match(Val, m_Shl(m_Value(Opd), m_ConstantInt(Cst)))) { + addSaveCandidate(*cast(Val)); + NewShift = isLShr ? (NewShift + Cst->getZExtValue()) + : (NewShift - Cst->getZExtValue()); + ShiftRecs.push_back({isLShr, Cst->getZExtValue()}); + Val = Opd; + } + // Iterate ShiftRecs in reverse order. Simulate the shifts of Mask. + for (auto SR : make_range(ShiftRecs.rbegin(), ShiftRecs.rend())) { + if (SR.first) + Mask = Mask.lshr(SR.second); + else + Mask = Mask.shl(SR.second); + } + // Mask contains a consecutive run of 1s. If the range BR of Mask are all + // 1s, BR is valid after the shifts. + if (!isRangeofAIAllOnes(BR, Mask)) + return false; + BR.Shift = NewShift; + + // Specify the proper BR we want to handle. + if (BR.Shift + BR.Width > BitSize) + return false; + if (BR.Width != 8 && BR.Width != 16 && BR.Width != 32 && BR.Width != 64) + return false; + if (BR.Shift % 8 != 0) + return false; + + // Match other Binary operators like Add/Sub/And/Xor/Or or pattern like + // And(Or(...And(Or(LargeVal, Cst), SmallVal))) and find the final load. + SmallVector Insts; + if (!matchInstsToShrink(Val, BR, Insts)) + return false; + + // Expect the final load has been found here. + assert(!Insts.empty() && "Expect Insts to be not empty"); + LoadInst *LI = dyn_cast(Insts.back()); + assert(LI && "Last elem in Insts should be a LoadInst"); + if (LI->isVolatile() || !LI->isUnordered()) + return false; + + IntegerType *NewTy = Type::getIntNTy(*Context, BR.Width); + if (TLI->isNarrowingExpensive(EVT::getEVT(Ty), EVT::getEVT(NewTy))) + return false; + + // Do legality check to ensure the range of shrunk load is not clobbered + // from *LI to IN. + IRBuilder<> Builder(*Context); + Builder.SetInsertPoint(&IN); + unsigned LoadByteOffset = computeByteOffset(BR, BitSize, *DL); + Value *NewPtr = + createNewPtr(LI->getPointerOperand(), LoadByteOffset, NewTy, Builder); + // There are clobbers and all the clobbers are in the same block as IN. + bool AllClobberInToBlock; + if (hasClobberBetween(*LI, IN, MemoryLocation(NewPtr, BR.Width / 8), + AllClobberInToBlock)) { + if (!AllClobberInToBlock) { + RecursivelyDeleteTriviallyDeadInstructions(NewPtr); + return false; + } + } + if (!isBeneficial(ResShift, Insts)) { + RecursivelyDeleteTriviallyDeadInstructions(NewPtr); + return false; + } + + // Do the shrinking transformation. + Value *NewVal = + shrinkInsts(NewPtr, BR, Insts, AllClobberInToBlock, ResShift, Builder); + DEBUG(dbgs() << "MemShrink: Replace" << IN << " with" << *NewVal << "\n"); + IN.replaceAllUsesWith(NewVal); + markCandForDeletion(IN); + NumLoadShrunk++; + return true; +} + +/// Check insts in CandidatesToErase. If they are dead, remove the dead +/// instructions recursively and clean up Memory SSA. +bool MemAccessShrinkingPass::removeDeadInsts() { + bool Changed = false; + + for (Instruction *I : CandidatesToErase) { + RecursivelyDeleteTriviallyDeadInstructions(I, TLInfo, MSSAUpdater.get()); + Changed = true; + } + CandidatesToErase.clear(); + return Changed; +} + +/// Try memory access shrinking on Function \Fn. 
+bool MemAccessShrinkingPass::tryShrink(Function &Fn) { + bool MadeChange = false; + for (BasicBlock *BB : post_order(&Fn)) { + for (auto InstI = BB->rbegin(), InstE = BB->rend(); InstI != InstE;) { + Instruction &Inst = *InstI++; + if (StoreInst *SI = dyn_cast(&Inst)) { + if (tryShrinkStoreBySplit(*SI)) { + MadeChange = true; + continue; + } + MadeChange |= tryShrinkStoreToLegalTy(*SI); + continue; + } + if (isa(&Inst) || isa(&Inst)) + MadeChange |= tryShrinkLoadUse(Inst); + } + } + return MadeChange; +} + +bool MemAccessShrinkingPass::runOnFunction(Function &Fn) { + if (skipFunction(Fn)) + return false; + + DL = &Fn.getParent()->getDataLayout(); + DT = &getAnalysis().getDomTree(); + TLInfo = &getAnalysis().getTLI(); + MSSA = &getAnalysis().getMSSA(); + MSSAWalker = MSSA->getWalker(); + MSSAUpdater = make_unique(MSSA); + CandidatesToErase.clear(); + + Context = &Fn.getContext(); + + if (TM) + TLI = TM->getSubtargetImpl(Fn)->getTargetLowering(); + + // Iterates the Fn until nothing is changed. + MSSA->verifyMemorySSA(); + bool MadeChange = false; + while (true) { + if (!tryShrink(Fn) && !removeDeadInsts()) + break; + MadeChange = true; + } + + return MadeChange; +} Index: lib/CodeGen/TargetPassConfig.cpp =================================================================== --- lib/CodeGen/TargetPassConfig.cpp +++ lib/CodeGen/TargetPassConfig.cpp @@ -68,6 +68,9 @@ cl::desc("Disable Machine LICM")); static cl::opt DisableMachineSink("disable-machine-sink", cl::Hidden, cl::desc("Disable Machine Sinking")); +static cl::opt + DisableMemAccessShrinking("disable-memaccess-shrinking", cl::Hidden, + cl::desc("Disable MemAccessShrinking")); static cl::opt DisableLSR("disable-lsr", cl::Hidden, cl::desc("Disable Loop Strength Reduction Pass")); static cl::opt DisableConstantHoisting("disable-constant-hoisting", @@ -482,6 +485,9 @@ if (getOptLevel() != CodeGenOpt::None && !DisableConstantHoisting) addPass(createConstantHoistingPass()); + if (getOptLevel() != CodeGenOpt::None && !DisableMemAccessShrinking) + addPass(createMemAccessShrinkingPass(TM)); + if (getOptLevel() != CodeGenOpt::None && !DisablePartialLibcallInlining) addPass(createPartiallyInlineLibCallsPass()); Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -141,6 +141,7 @@ bool isZExtFree(SDValue Val, EVT VT2) const override; bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + bool isNarrowingExpensive(EVT VT1, EVT VT2) const override; MVT getVectorIdxTy(const DataLayout &) const override; bool isSelectSupported(SelectSupportKind) const override; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -761,6 +761,21 @@ return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; } +bool AMDGPUTargetLowering::isNarrowingExpensive(EVT SrcVT, EVT DestVT) const { + // If we are reducing to a 32-bit load, this is always better. + if (DestVT.getStoreSizeInBits() == 32) + return true; + + // Don't produce extloads from sub 32-bit types. SI doesn't have scalar + // extloads, so doing one requires using a buffer_load. In cases where we + // still couldn't use a scalar load, using the wider load shouldn't really + // hurt anything. 
+ + // If the old size already had to be an extload, there's no harm in continuing + // to reduce the width. + return (SrcVT.getStoreSizeInBits() < 32); +} + //===---------------------------------------------------------------------===// // TargetLowering Callbacks //===---------------------------------------------------------------------===// Index: lib/Transforms/Utils/Local.cpp =================================================================== --- lib/Transforms/Utils/Local.cpp +++ lib/Transforms/Utils/Local.cpp @@ -364,9 +364,8 @@ /// trivially dead instruction, delete it. If that makes any of its operands /// trivially dead, delete them too, recursively. Return true if any /// instructions were deleted. -bool -llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V, - const TargetLibraryInfo *TLI) { +bool llvm::RecursivelyDeleteTriviallyDeadInstructions( + Value *V, const TargetLibraryInfo *TLI, MemorySSAUpdater *MSSAUpdater) { Instruction *I = dyn_cast(V); if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLI)) return false; @@ -393,6 +392,11 @@ DeadInsts.push_back(OpI); } + // If MemorySSA is used, update it when we are about to erase a memory + // access instruction. + if (MSSAUpdater) + MSSAUpdater->removeMemoryAccess(I); + I->eraseFromParent(); } while (!DeadInsts.empty()); Index: test/CodeGen/ARM/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -12,13 +12,9 @@ ; ; BE-LABEL: i24_or: ; BE: @ BB#0: -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: ldrb r2, [r0, #2] -; BE-NEXT: orr r1, r2, r1, lsl #8 +; BE-NEXT: ldrh r1, [r0, #1] ; BE-NEXT: orr r1, r1, #384 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] +; BE-NEXT: strh r1, [r0, #1] ; BE-NEXT: mov pc, lr %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 @@ -39,11 +35,12 @@ ; ; BE-LABEL: i24_and_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, #128 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: orr r1, r1, #1 -; BE-NEXT: strh r1, [r0] +; BE-NEXT: ldrh r1, [r0, #1] +; BE-NEXT: mov r2, #16256 +; BE-NEXT: orr r2, r2, #49152 +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: and r1, r1, r2 +; BE-NEXT: strh r1, [r0, #1] ; BE-NEXT: mov pc, lr %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 @@ -55,23 +52,18 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; LE-LABEL: i24_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldrh r2, [r0] -; LE-NEXT: mov r3, #255 -; LE-NEXT: orr r3, r3, #57088 -; LE-NEXT: and r2, r2, r3 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: strh r1, [r0] +; LE-NEXT: ldrb r2, [r0, #1] +; LE-NEXT: and r2, r2, #223 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: strb r1, [r0, #1] ; LE-NEXT: mov pc, lr ; ; BE-LABEL: i24_insert_bit: ; BE: @ BB#0: -; BE-NEXT: ldrh r2, [r0] -; BE-NEXT: mov r3, #57088 -; BE-NEXT: orr r3, r3, #16711680 -; BE-NEXT: and r2, r3, r2, lsl #8 -; BE-NEXT: orr r1, r2, r1, lsl #13 -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] +; BE-NEXT: ldrb r2, [r0, #1] +; BE-NEXT: and r2, r2, #223 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: strb r1, [r0, #1] ; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i24 %b = load i24, i24* %a, align 1 @@ -92,19 +84,9 @@ ; ; BE-LABEL: i56_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: ldrh r2, [r1, #4]! 
-; BE-NEXT: ldrb r3, [r1, #2] -; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strb r2, [r1, #2] -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %aa = load i56, i56* %a %b = or i56 %aa, 384 @@ -123,19 +105,10 @@ ; ; BE-LABEL: i56_and_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: mov r3, #128 -; BE-NEXT: ldrh r2, [r1, #4]! -; BE-NEXT: strb r3, [r1, #2] -; BE-NEXT: lsl r2, r2, #8 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: bic r1, r1, #127 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %b = load i56, i56* %a, align 1 @@ -148,30 +121,18 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { ; LE-LABEL: i56_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldr r2, [r0] -; LE-NEXT: bic r2, r2, #8192 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: str r1, [r0] +; LE-NEXT: ldr r2, [r0, #1] +; LE-NEXT: bic r2, r2, #32 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: str r1, [r0, #1] ; LE-NEXT: mov pc, lr ; ; BE-LABEL: i56_insert_bit: ; BE: @ BB#0: -; BE-NEXT: .save {r11, lr} -; BE-NEXT: push {r11, lr} -; BE-NEXT: mov r2, r0 -; BE-NEXT: ldr lr, [r0] -; BE-NEXT: ldrh r12, [r2, #4]! -; BE-NEXT: ldrb r3, [r2, #2] -; BE-NEXT: orr r12, r3, r12, lsl #8 -; BE-NEXT: orr r3, r12, lr, lsl #24 -; BE-NEXT: bic r3, r3, #8192 -; BE-NEXT: orr r1, r3, r1, lsl #13 -; BE-NEXT: lsr r3, r1, #8 -; BE-NEXT: strh r3, [r2] -; BE-NEXT: bic r2, lr, #255 -; BE-NEXT: orr r1, r2, r1, lsr #24 -; BE-NEXT: str r1, [r0] -; BE-NEXT: pop {r11, lr} +; BE-NEXT: ldr r2, [r0, #2] +; BE-NEXT: bic r2, r2, #32 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: str r1, [r0, #2] ; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 Index: test/CodeGen/ARM/load-shrink.ll =================================================================== --- test/CodeGen/ARM/load-shrink.ll +++ test/CodeGen/ARM/load-shrink.ll @@ -0,0 +1,111 @@ +; RUN: opt < %s -mtriple=arm-eabi -memaccessshrink -S | FileCheck %s +; Check bitfield load is shrinked properly in cases below. + +target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32" + +; The bitfield store can be shrinked from i64 to i16. +; CHECK-LABEL: @load_and( +; CHECK: %cast = bitcast i64* %ptr to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 8 +; CHECK: %trunc.zext = zext i16 %load.trunc to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext, 1 + +define i1 @load_and(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.clear = and i64 %bf.load, 65535 + %cmp = icmp eq i64 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. +; CHECK-LABEL: @load_trunc( +; CHECK: %cast = bitcast i64* %ptr to i8* +; CHECK: %load.trunc = load i8, i8* %cast, align 8 +; CHECK: %cmp = icmp eq i8 %load.trunc, 1 + +define i1 @load_trunc(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.clear = trunc i64 %bf.load to i8 + %cmp = icmp eq i8 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. 
+; CHECK-LABEL: @load_and_shr(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %trunc.zext = zext i8 %load.trunc to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.lshr = lshr i64 %bf.load, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_and_shr_add(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr_add(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %add = add i64 %bf.load, 281474976710656
+  %bf.lshr = lshr i64 %add, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_ops(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_ops(i64* %ptr, i64 %value) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.value = and i64 %value, 65535
+  %bf.clear1 = and i64 %bf.load, -65536
+  %bf.set = or i64 %bf.value, %bf.clear1
+  %add = add i64 %bf.set, 281474976710656
+  %bf.lshr = lshr i64 %add, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The shrink is not worth doing because %bf.load has multiple uses
+; and shrinking here does not save any instructions.
+; CHECK-LABEL: @load_trunc_multiuses
+; CHECK: %bf.load = load i64, i64* %ptr, align 8
+; CHECK: %bf.trunc = trunc i64 %bf.load to i16
+; CHECK: %cmp1 = icmp eq i16 %bf.trunc, 3
+; CHECK: %cmp2 = icmp ult i64 %bf.load, 1500000
+; CHECK: %cmp = and i1 %cmp1, %cmp2
+
+define i1 @load_trunc_multiuses(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.trunc = trunc i64 %bf.load to i16
+  %cmp1 = icmp eq i16 %bf.trunc, 3
+  %cmp2 = icmp ult i64 %bf.load, 1500000
+  %cmp = and i1 %cmp1, %cmp2
+  ret i1 %cmp
+}
+
Index: test/CodeGen/ARM/store-shrink.ll
===================================================================
--- test/CodeGen/ARM/store-shrink.ll
+++ test/CodeGen/ARM/store-shrink.ll
@@ -0,0 +1,366 @@
+; RUN: opt < %s -mtriple=arm-eabi -memaccessshrink -S | FileCheck %s
+; Check that bitfield stores are shrunk properly in the cases below.
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+
+; class A1 {
+; unsigned long f1:8;
+; unsigned long f2:3;
+; } a1;
+; a1.f1 = n;
+;
+; The bitfield store can be shrunk from i16 to i8.
+; CHECK-LABEL: @test1( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %t0 = trunc i64 %conv to i16 +; CHECK: %bf.value = and i16 %t0, 255 +; CHECK: %trunc = trunc i16 %bf.value to i8 +; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8 + +%class.A1 = type { i16, [6 x i8] } +@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8 + +define void @test1(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %t0 = trunc i64 %conv to i16 + %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + %bf.value = and i16 %t0, 255 + %bf.clear = and i16 %bf.load, -256 + %bf.set = or i16 %bf.clear, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + ret void +} + +; class A2 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a2; +; a2.f1 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test2( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8 + +%class.A2 = type { i24, [4 x i8] } +@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8 + +define void @test2(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.clear = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear, %bf.value + store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8 + ret void +} + +; class A3 { +; unsigned long f1:32; +; unsigned long f2:3; +; } a3; +; a3.f1 = n; +; The bitfield store can be shrinked from i64 to i32. +; CHECK-LABEL: @test3( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %bf.value = and i64 %conv, 4294967295 +; CHECK: %trunc = trunc i64 %bf.value to i32 +; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8 + +%class.A3 = type { i40 } +@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8 + +define void @test3(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8 + %bf.value = and i64 %conv, 4294967295 + %bf.clear = and i64 %bf.load, -4294967296 + %bf.set = or i64 %bf.clear, %bf.value + store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8 + ret void +} + +; class A4 { +; unsigned long f1:13; +; unsigned long f2:3; +; } a4; +; a4.f1 = n; +; The bitfield store cannot be shrinked because the field is not 8/16/32 bits. 
+; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. +; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. 
+; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. +; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. 
+; CHECK-LABEL: @i24_insert_bit( +; CHECK: %extbit = zext i1 %bit to i24 +; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13 +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i24 %extbit.shl, 8 +; CHECK: %trunc = trunc i24 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i24 + %b = load i24, i24* %a, align 1 + %extbit.shl = shl nuw nsw i24 %extbit, 13 + %c = and i24 %b, -8193 + %d = or i24 %c, %extbit.shl + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or( +; CHECK: %cast = bitcast i56* %a to i32* +; CHECK: %load.trunc = load i32, i32* %cast, align 1 +; CHECK: %or.trunc = or i32 %load.trunc, 384 +; CHECK: store i32 %or.trunc, i32* %cast, align 1 +; +define void @i56_or(i56* %a) { + %aa = load i56, i56* %a, align 1 + %b = or i56 %aa, 384 + store i56 %b, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_and_or( +; CHECK: %cast = bitcast i56* %a to i32* +; CHECK: %load.trunc = load i32, i32* %cast, align 1 +; CHECK: %and.trunc = and i32 %load.trunc, -128 +; CHECK: %or.trunc = or i32 %and.trunc, 384 +; CHECK: store i32 %or.trunc, i32* %cast, align 1 +; +define void @i56_and_or(i56* %a) { + %b = load i56, i56* %a, align 1 + %c = and i56 %b, -128 + %d = or i56 %c, 384 + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store to i8. +; CHECK-LABEL: @i56_insert_bit( +; CHECK: %extbit = zext i1 %bit to i56 +; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13 +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %load.trunc = load i32, i32* %cast1, align 1 +; CHECK: %lshr = lshr i56 %extbit.shl, 8 +; CHECK: %trunc = trunc i56 %lshr to i32 +; CHECK: %and.trunc = and i32 %load.trunc, -33 +; CHECK: %or.trunc = or i32 %and.trunc, %trunc +; CHECK: store i32 %or.trunc, i32* %cast1, align 1 +; +define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i56 + %b = load i56, i56* %a, align 1 + %extbit.shl = shl nuw nsw i56 %extbit, 13 + %c = and i56 %b, -8193 + %d = or i56 %c, %extbit.shl + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. 
+; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %load.trunc = load i32, i32* %cast1, align 2 +; CHECK: %or.trunc = or i32 %load.trunc, 272 +; CHECK: store i32 %or.trunc, i32* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: test/CodeGen/X86/2008-09-11-CoalescerBug2.ll =================================================================== --- test/CodeGen/X86/2008-09-11-CoalescerBug2.ll +++ test/CodeGen/X86/2008-09-11-CoalescerBug2.ll @@ -13,9 +13,9 @@ ; SOURCE-SCHED: xorl ; SOURCE-SCHED: cmpl ; SOURCE-SCHED: setg -; SOURCE-SCHED: movb ; SOURCE-SCHED: xorl ; SOURCE-SCHED: subl +; SOURCE-SCHED: movb ; SOURCE-SCHED: testb ; SOURCE-SCHED: jne %0 = load i32, i32* @g_5, align 4 ; [#uses=1] Index: test/CodeGen/X86/constant-combines.ll =================================================================== --- test/CodeGen/X86/constant-combines.ll +++ test/CodeGen/X86/constant-combines.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | FileCheck %s +; RUN: llc -disable-memaccess-shrinking < %s | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" Index: test/CodeGen/X86/i16lshr8pat.ll =================================================================== --- test/CodeGen/X86/i16lshr8pat.ll +++ test/CodeGen/X86/i16lshr8pat.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=x86 -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s +; RUN: llc -march=x86 -disable-memaccess-shrinking -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386-unknown-linux-gnu" Index: test/CodeGen/X86/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -4,13 +4,7 @@ define void @i24_or(i24* %a) { ; CHECK-LABEL: i24_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180 ; CHECK-NEXT: retq %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 @@ -22,13 +16,9 @@ ; CHECK-LABEL: i24_and_or: ; CHECK: # BB#0: ; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 -; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: orl $384, %eax # imm = 0x180 +; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80 +; CHECK-NEXT: movw %ax, (%rdi) ; CHECK-NEXT: retq %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 @@ -40,16 +30,11 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; CHECK-LABEL: i24_insert_bit: ; CHECK: # BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: movzwl (%rdi), %ecx -; CHECK-NEXT: movzbl 2(%rdi), %edx -; CHECK-NEXT: movb %dl, 2(%rdi) -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: shll $13, %eax -; CHECK-NEXT: andl $16769023, %edx # imm = 0xFFDFFF -; CHECK-NEXT: orl %eax, %edx -; CHECK-NEXT: movw %dx, (%rdi) +; 
CHECK-NEXT: movb 1(%rdi), %al +; CHECK-NEXT: shlb $5, %sil +; CHECK-NEXT: andb $-33, %al +; CHECK-NEXT: orb %sil, %al +; CHECK-NEXT: movb %al, 1(%rdi) ; CHECK-NEXT: retq %extbit = zext i1 %bit to i24 %b = load i24, i24* %a, align 1 @@ -63,19 +48,7 @@ define void @i56_or(i56* %a) { ; CHECK-LABEL: i56_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) -; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: orq $384, %rdx # imm = 0x180 -; CHECK-NEXT: movl %edx, (%rdi) -; CHECK-NEXT: shrq $32, %rdx -; CHECK-NEXT: movw %dx, 4(%rdi) +; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180 ; CHECK-NEXT: retq %aa = load i56, i56* %a, align 1 %b = or i56 %aa, 384 @@ -86,21 +59,10 @@ define void @i56_and_or(i56* %a) { ; CHECK-LABEL: i56_and_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) -; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: orq $384, %rdx # imm = 0x180 -; CHECK-NEXT: movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80 -; CHECK-NEXT: andq %rdx, %rax -; CHECK-NEXT: movl %eax, (%rdi) -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: movw %ax, 4(%rdi) +; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: orl $384, %eax # imm = 0x180 +; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80 +; CHECK-NEXT: movw %ax, (%rdi) ; CHECK-NEXT: retq %b = load i56, i56* %a, align 1 %c = and i56 %b, -128 @@ -112,23 +74,11 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { ; CHECK-LABEL: i56_insert_bit: ; CHECK: # BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: movzwl 4(%rdi), %ecx -; CHECK-NEXT: movzbl 6(%rdi), %edx -; CHECK-NEXT: movl (%rdi), %esi -; CHECK-NEXT: movb %dl, 6(%rdi) -; CHECK-NEXT: # kill: %EDX %EDX %RDX %RDX -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: shlq $32, %rdx -; CHECK-NEXT: orq %rdx, %rsi -; CHECK-NEXT: shlq $13, %rax -; CHECK-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: movl %ecx, (%rdi) -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: movw %cx, 4(%rdi) +; CHECK-NEXT: movb 1(%rdi), %al +; CHECK-NEXT: shlb $5, %sil +; CHECK-NEXT: andb $-33, %al +; CHECK-NEXT: orb %sil, %al +; CHECK-NEXT: movb %al, 1(%rdi) ; CHECK-NEXT: retq %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 Index: test/CodeGen/X86/load-shrink.ll =================================================================== --- test/CodeGen/X86/load-shrink.ll +++ test/CodeGen/X86/load-shrink.ll @@ -0,0 +1,111 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s +; Check bitfield load is shrinked properly in cases below. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; The bitfield store can be shrinked from i64 to i16. 
+; CHECK-LABEL: @load_and(
+; CHECK: %cast = bitcast i64* %ptr to i16*
+; CHECK: %load.trunc = load i16, i16* %cast, align 8
+; CHECK: %trunc.zext = zext i16 %load.trunc to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.clear = and i64 %bf.load, 65535
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_trunc(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %load.trunc = load i8, i8* %cast, align 8
+; CHECK: %cmp = icmp eq i8 %load.trunc, 1
+
+define i1 @load_trunc(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.clear = trunc i64 %bf.load to i8
+  %cmp = icmp eq i8 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_and_shr(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %trunc.zext = zext i8 %load.trunc to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.lshr = lshr i64 %bf.load, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_and_shr_add(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr_add(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %add = add i64 %bf.load, 281474976710656
+  %bf.lshr = lshr i64 %add, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_ops(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_ops(i64* %ptr, i64 %value) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.value = and i64 %value, 65535
+  %bf.clear1 = and i64 %bf.load, -65536
+  %bf.set = or i64 %bf.value, %bf.clear1
+  %add = add i64 %bf.set, 281474976710656
+  %bf.lshr = lshr i64 %add, 48
+  %bf.clear = and i64 %bf.lshr, 255
+  %cmp = icmp eq i64 %bf.clear, 1
+  ret i1 %cmp
+}
+
+; The shrink is not worth doing because %bf.load has multiple uses
+; and shrinking here does not save any instructions.
+; CHECK-LABEL: @load_trunc_multiuses
+; CHECK: %bf.load = load i64, i64* %ptr, align 8
+; CHECK: %bf.trunc = trunc i64 %bf.load to i16
+; CHECK: %cmp1 = icmp eq i16 %bf.trunc, 3
+; CHECK: %cmp2 = icmp ult i64 %bf.load, 1500000
+; CHECK: %cmp = and i1 %cmp1, %cmp2
+
+define i1 @load_trunc_multiuses(i64* %ptr) {
+entry:
+  %bf.load = load i64, i64* %ptr, align 8
+  %bf.trunc = trunc i64 %bf.load to i16
+  %cmp1 = icmp eq i16 %bf.trunc, 3
+  %cmp2 = icmp ult i64 %bf.load, 1500000
+  %cmp = and i1 %cmp1, %cmp2
+  ret i1 %cmp
+}
+
Index: test/CodeGen/X86/load-slice.ll
===================================================================
--- test/CodeGen/X86/load-slice.ll
+++ test/CodeGen/X86/load-slice.ll
@@ -110,20 +110,23 @@
   ret i32 %res
 }
 
-; Check that we do not optimize overlapping slices.
 ;
-; The 64-bits should NOT have been split in as slices are overlapping.
+; The slices are overlapping, but the load can still be split.
 ; First slice uses bytes numbered 0 to 3.
 ; Second slice uses bytes numbered 6 and 7.
 ; Third slice uses bytes numbered 4 to 7.
 ;
 ; STRESS-LABEL: t3:
-; STRESS: shrq $48
-; STRESS: shrq $32
+; STRESS: movzwl 6(%rdi,%rsi,8), %eax
+; STRESS-NEXT: addl (%rdi,%rsi,8), %eax
+; STRESS-NEXT: addl 4(%rdi,%rsi,8), %eax
 ;
 ; REGULAR-LABEL: t3:
-; REGULAR: shrq $48
-; REGULAR: shrq $32
+; REGULAR: movq (%rdi,%rsi,8), %rcx
+; REGULAR-NEXT: movq %rcx, %rax
+; REGULAR-NEXT: shrq $48, %rax
+; REGULAR-NEXT: addl %ecx, %eax
+; REGULAR-NEXT: addl 4(%rdi,%rsi,8), %eax
 define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
   %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
   %bitcast = bitcast %class.Complex* %arrayidx to i64*
Index: test/CodeGen/X86/lsr-loop-exit-cond.ll
===================================================================
--- test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -2,14 +2,16 @@
 ; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK-LABEL: t:
+; CHECK: [[LABEL:LBB.*]]:
 ; CHECK: movl (%r9,%rax,4), %e{{..}}
-; CHECK-NEXT: testq
-; CHECK-NEXT: jne
+; CHECK: testq
+; CHECK: jne [[LABEL]]
 
 ; ATOM-LABEL: t:
+; ATOM: [[LABEL:LBB.*]]:
 ; ATOM: movl (%r9,%r{{.+}},4), %e{{..}}
-; ATOM-NEXT: testq
-; ATOM-NEXT: jne
+; ATOM: testq
+; ATOM: jne [[LABEL]]
 
 @Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5]
 @Te1 = external global [256 x i32] ; <[256 x i32]*> [#uses=4]
Index: test/CodeGen/X86/mem-access-shrink.ll
===================================================================
--- test/CodeGen/X86/mem-access-shrink.ll
+++ test/CodeGen/X86/mem-access-shrink.ll
@@ -0,0 +1,216 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s
+; Check that the memory accesses in the test are shrunk as expected.
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%class.B = type { i56, [4 x i16], i64 } +@i = local_unnamed_addr global i64 0, align 8 + +define zeroext i1 @foo(%class.B* nocapture %this, i8* %p1, i64 %p2) local_unnamed_addr align 2 { +; CHECK-LABEL: @foo( +entry: +; CHECK: entry: +; CHECK: %t0 = bitcast %class.B* %this to i64* +; CHECK: %[[CAST1:.*]] = bitcast i64* %t0 to i16* +; CHECK: %[[LOAD1:.*]] = load i16, i16* %[[CAST1]], align 8 +; CHECK: %[[TRUNC1:.*]] = zext i16 %[[LOAD1]] to i64 +; CHECK: %cmp = icmp eq i64 %[[TRUNC1]], 1 + + %t0 = bitcast %class.B* %this to i64* + %bf.load = load i64, i64* %t0, align 8 + %bf.clear = and i64 %bf.load, 65535 + %cmp = icmp eq i64 %bf.clear, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry +; CHECK: if.end: +; CHECK: %[[CAST2:.*]] = bitcast i64* %t0 to i16* +; CHECK: %[[LTRUNC1:.*]] = load i16, i16* %[[CAST2]], align 8 +; CHECK: %[[ADD1:.*]] = add i16 %[[LTRUNC1]], -1 +; CHECK: %[[TRUNCZ1:.*]] = zext i16 %[[ADD1]] to i64 +; CHECK: %[[CAST3:.*]] = bitcast i64* %t0 to i16* +; CHECK: %[[TRUNC2:.*]] = trunc i64 %[[TRUNCZ1]] to i16 +; CHECK: store i16 %[[TRUNC2]], i16* %[[CAST3]], align 8 + + %dec = add i64 %bf.load, 65535 + %bf.value = and i64 %dec, 65535 + %bf.clear5 = and i64 %bf.load, -65536 + %bf.set = or i64 %bf.value, %bf.clear5 + store i64 %bf.set, i64* %t0, align 8 + %t1 = ptrtoint i8* %p1 to i64 + %t2 = load i64, i64* @i, align 8 + %cmp.i = icmp ult i64 %t2, 3 + br i1 %cmp.i, label %if.then.i, label %if.else.i + +if.then.i: ; preds = %if.end + %and.i = lshr i64 %t1, 3 + %div.i = and i64 %and.i, 1023 + br label %_ZNK1B5m_fn3EPv.exit + +if.else.i: ; preds = %if.end + %first_page_.i = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 2 + %t3 = load i64, i64* %first_page_.i, align 8 + %mul.i = shl i64 %t3, 13 + %sub.i = sub i64 %t1, %mul.i + %div2.i = lshr i64 %sub.i, 2 + br label %_ZNK1B5m_fn3EPv.exit + +_ZNK1B5m_fn3EPv.exit: ; preds = %if.then.i, %if.else.i +; CHECK: _ZNK1B5m_fn3EPv.exit: +; CHECK: %[[CAST4:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP1:.*]] = getelementptr i8, i8* %[[CAST4]], i32 6 +; CHECK: %[[LTRUNC2:.*]] = load i8, i8* %[[UGEP1]], align 2 +; CHECK: %[[TRUNCZ2:.*]] = zext i8 %[[LTRUNC2]] to i64 +; CHECK: %cmp8 = icmp eq i64 %[[TRUNCZ2]], 4 + + %j.0.i = phi i64 [ %div.i, %if.then.i ], [ %div2.i, %if.else.i ] + %conv.i = trunc i64 %j.0.i to i16 + %bf.lshr = lshr i64 %bf.load, 48 + %bf.clear7 = and i64 %bf.lshr, 255 + %cmp8 = icmp eq i64 %bf.clear7, 4 + br i1 %cmp8, label %if.else, label %if.then9 + +if.then9: ; preds = %_ZNK1B5m_fn3EPv.exit +; CHECK: if.then9: +; CHECK: %arrayidx = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 1, i64 %[[TRUNCZ2]] +; CHECK: store i16 %conv.i, i16* %arrayidx, align 2 +; CHECK: %[[CAST5:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP2:.*]] = getelementptr i8, i8* %[[CAST5]], i32 6 +; CHECK: %[[CAST6:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP3:.*]] = getelementptr i8, i8* %[[CAST6]], i32 6 +; CHECK: %[[LTRUNC3:.*]] = load i8, i8* %[[UGEP3]], align 2 +; CHECK: %[[ADD2:.*]] = add i8 %[[LTRUNC3]], 1 +; CHECK: %[[AND1:.*]] = and i8 %[[ADD2]], -1 +; CHECK: store i8 %[[AND1]], i8* %[[UGEP2]], align 2 + + %arrayidx = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 1, i64 %bf.clear7 + store i16 %conv.i, i16* %arrayidx, align 2 + %inc79 = add i64 %bf.set, 281474976710656 + %bf.shl = and i64 %inc79, 71776119061217280 + %bf.clear18 = and i64 %bf.set, -71776119061217281 + %bf.set19 = or i64 %bf.shl, %bf.clear18 + store 
i64 %bf.set19, i64* %t0, align 8 + br label %return + +if.else: ; preds = %_ZNK1B5m_fn3EPv.exit +; CHECK: if.else: +; CHECK: %[[CAST7:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP4:.*]] = getelementptr i8, i8* %[[CAST7]], i32 4 +; CHECK: %[[CAST8:.*]] = bitcast i8* %[[UGEP4]] to i16* +; CHECK: %[[LTRUNC4:.*]] = load i16, i16* %[[CAST8]], align 4 +; CHECK: %[[TRUNCZ3:.*]] = zext i16 %[[LTRUNC4]] to i64 +; CHECK: %cmp24 = icmp eq i64 %[[TRUNCZ3]], 1 + + %bf.lshr21 = lshr i64 %bf.load, 32 + %bf.clear22 = and i64 %bf.lshr21, 65535 + %cmp24 = icmp eq i64 %bf.clear22, 1 + br i1 %cmp24, label %if.else55, label %land.lhs.true + +land.lhs.true: ; preds = %if.else +; CHECK: land.lhs.true: +; CHECK: %[[CAST9:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP5:.*]] = getelementptr i8, i8* %[[CAST9]], i32 2 +; CHECK: %[[CAST10:.*]] = bitcast i8* %[[UGEP5]] to i16* +; CHECK: %[[LTRUNC5:.*]] = load i16, i16* %[[CAST10]], align 2 +; CHECK: %[[TRUNCZ4:.*]] = zext i16 %[[LTRUNC5]] to i64 +; CHECK: %cmp28 = icmp eq i64 %[[TRUNCZ4]], %sub + + %bf.lshr26 = lshr i64 %bf.load, 16 + %bf.clear27 = and i64 %bf.lshr26, 65535 + %div = lshr i64 %p2, 1 + %sub = add nsw i64 %div, -1 + %cmp28 = icmp eq i64 %bf.clear27, %sub + br i1 %cmp28, label %if.else55, label %if.then29 + +if.then29: ; preds = %land.lhs.true + %cmp30 = icmp ult i64 %p2, 3 + br i1 %cmp30, label %if.then31, label %if.else35 + +if.then31: ; preds = %if.then29 +; CHECK: if.then31: +; CHECK: %mul = shl nuw nsw i64 %[[TRUNCZ3]], 3 + + %and = and i64 %t1, -8192 + %mul = shl nuw nsw i64 %bf.clear22, 3 + %add = add i64 %mul, %and + br label %if.end41 + +if.else35: ; preds = %if.then29 +; CHECK: if.else35: +; CHECK: %[[CAST11:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP6:.*]] = getelementptr i8, i8* %[[CAST11]], i32 4 +; CHECK: %[[CAST12:.*]] = bitcast i8* %[[UGEP6]] to i16* +; CHECK: %[[LTRUNC6:.*]] = load i16, i16* %[[CAST12]], align 4 +; CHECK: %[[TRUNCZ5:.*]] = zext i16 %[[LTRUNC6]] to i64 +; CHECK: %shl = shl i64 %[[TRUNCZ5]], 6 + + %first_page_.i80 = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 2 + %t4 = load i64, i64* %first_page_.i80, align 8 + %mul.i81 = shl i64 %t4, 13 + %conv.i82 = shl nuw nsw i64 %bf.lshr21, 6 + %mul3.i = and i64 %conv.i82, 4194240 + %add.i = add i64 %mul.i81, %mul3.i + br label %if.end41 + +if.end41: ; preds = %if.else35, %if.then31 +; CHECK: if.end41: +; CHECK: %[[CAST17:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP9:.*]] = getelementptr i8, i8* %[[CAST17]], i32 2 +; CHECK: %[[CAST18:.*]] = bitcast i8* %[[UGEP9]] to i16* +; CHECK: %[[LTRUNC8:.*]] = load i16, i16* %[[CAST18]], align 2 +; CHECK: %[[ADD4:.*]] = add i16 %[[LTRUNC8]], 1 +; CHECK: %[[TRUNC3:.*]] = zext i16 %[[ADD4]] to i64 +; CHECK: %[[CAST13:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP7:.*]] = getelementptr i8, i8* %[[CAST13]], i32 2 +; CHECK: %[[CAST14:.*]] = bitcast i8* %[[UGEP7]] to i16* +; CHECK: %[[CAST15:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP8:.*]] = getelementptr i8, i8* %[[CAST15]], i32 2 +; CHECK: %[[CAST16:.*]] = bitcast i8* %[[UGEP8]] to i16* +; CHECK: %[[LTRUNC7:.*]] = load i16, i16* %[[CAST16]], align 2 +; CHECK: %[[ADD3:.*]] = add i16 %[[LTRUNC7]], 1 +; CHECK: %[[AND2:.*]] = and i16 %[[ADD3]], -1 +; CHECK: store i16 %[[AND2]], i16* %[[CAST14]], align 2 +; CHECK: %arrayidx54 = getelementptr inbounds i16, i16* %t5, i64 %[[TRUNC3]] + + %add.i.sink = phi i64 [ %add.i, %if.else35 ], [ %add, %if.then31 ] + %t5 = inttoptr i64 %add.i.sink to i16* + %inc4578 = add i64 %bf.set, 65536 + %bf.shl48 = and i64 %inc4578, 4294901760 
+ %bf.clear49 = and i64 %bf.set, -4294901761 + %bf.set50 = or i64 %bf.shl48, %bf.clear49 + store i64 %bf.set50, i64* %t0, align 8 + %bf.lshr52 = lshr i64 %inc4578, 16 + %bf.clear53 = and i64 %bf.lshr52, 65535 + %arrayidx54 = getelementptr inbounds i16, i16* %t5, i64 %bf.clear53 + store i16 %conv.i, i16* %arrayidx54, align 2 + br label %return + +if.else55: ; preds = %land.lhs.true, %if.else +; CHECK: if.else55: +; CHECK: %[[CAST19:.*]] = bitcast i64* %t0 to i8* +; CHECK: %[[UGEP10:.*]] = getelementptr i8, i8* %[[CAST19]], i32 4 +; CHECK: %[[CAST20:.*]] = bitcast i8* %[[UGEP10]] to i16* +; CHECK: %[[LTRUNC9:.*]] = load i16, i16* %[[CAST20]], align 4 +; CHECK: store i16 %[[LTRUNC9]], i16* %t6, align 2 +; CHECK: %[[UGEP11:.*]] = getelementptr i8, i8* %cast, i32 2 +; CHECK: %[[CAST21:.*]] = bitcast i8* %[[UGEP11]] to i32* +; CHECK: %lshr = lshr i64 %bf.shl63, 16 +; CHECK: %[[TRUNC4:.*]] = trunc i64 %lshr to i32 +; CHECK: store i32 %[[TRUNC4]], i32* %[[CAST21]], align 2 + + %conv59 = trunc i64 %bf.lshr21 to i16 + %t6 = bitcast i8* %p1 to i16* + store i16 %conv59, i16* %t6, align 2 + %bf.load61 = load i64, i64* %t0, align 8 + %conv60 = shl i64 %j.0.i, 32 + %bf.shl63 = and i64 %conv60, 281470681743360 + %bf.clear64 = and i64 %bf.load61, -281474976645121 + %bf.clear67 = or i64 %bf.clear64, %bf.shl63 + store i64 %bf.clear67, i64* %t0, align 8 + br label %return + +return: ; preds = %if.then9, %if.else55, %if.end41, %entry + %retval.0 = phi i1 [ false, %entry ], [ true, %if.end41 ], [ true, %if.else55 ], [ true, %if.then9 ] + ret i1 %retval.0 +} + Index: test/CodeGen/X86/store-shrink.ll =================================================================== --- test/CodeGen/X86/store-shrink.ll +++ test/CodeGen/X86/store-shrink.ll @@ -0,0 +1,365 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s +; Check bitfield store is shrinked properly in cases below. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; class A1 { +; unsigned long f1:8; +; unsigned long f2:3; +; } a1; +; a1.f1 = n; +; +; The bitfield store can be shrinked from i16 to i8. +; CHECK-LABEL: @test1( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %t0 = trunc i64 %conv to i16 +; CHECK: %bf.value = and i16 %t0, 255 +; CHECK: %trunc = trunc i16 %bf.value to i8 +; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8 + +%class.A1 = type { i16, [6 x i8] } +@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8 + +define void @test1(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %t0 = trunc i64 %conv to i16 + %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + %bf.value = and i16 %t0, 255 + %bf.clear = and i16 %bf.load, -256 + %bf.set = or i16 %bf.clear, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + ret void +} + +; class A2 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a2; +; a2.f1 = n; +; The bitfield store can be shrinked from i32 to i16. 
+; CHECK-LABEL: @test2( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8 + +%class.A2 = type { i24, [4 x i8] } +@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8 + +define void @test2(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.clear = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear, %bf.value + store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8 + ret void +} + +; class A3 { +; unsigned long f1:32; +; unsigned long f2:3; +; } a3; +; a3.f1 = n; +; The bitfield store can be shrinked from i64 to i32. +; CHECK-LABEL: @test3( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %bf.value = and i64 %conv, 4294967295 +; CHECK: %trunc = trunc i64 %bf.value to i32 +; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8 + +%class.A3 = type { i40 } +@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8 + +define void @test3(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8 + %bf.value = and i64 %conv, 4294967295 + %bf.clear = and i64 %bf.load, -4294967296 + %bf.set = or i64 %bf.clear, %bf.value + store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8 + ret void +} + +; class A4 { +; unsigned long f1:13; +; unsigned long f2:3; +; } a4; +; a4.f1 = n; +; The bitfield store cannot be shrinked because the field is not 8/16/32 bits. +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. 
+; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. +; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. 
+; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. +; CHECK-LABEL: @i24_insert_bit( +; CHECK: %extbit = zext i1 %bit to i24 +; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13 +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i24 %extbit.shl, 8 +; CHECK: %trunc = trunc i24 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i24 + %b = load i24, i24* %a, align 1 + %extbit.shl = shl nuw nsw i24 %extbit, 13 + %c = and i24 %b, -8193 + %d = or i24 %c, %extbit.shl + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_or(i56* %a) { + %aa = load i56, i56* %a, align 1 + %b = or i56 %aa, 384 + store i56 %b, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_and_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_and_or(i56* %a) { + %b = load i56, i56* %a, align 1 + %c = and i56 %b, -128 + %d = or i56 %c, 384 + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store to i8. 
+; CHECK-LABEL: @i56_insert_bit( +; CHECK: %extbit = zext i1 %bit to i56 +; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13 +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i56 %extbit.shl, 8 +; CHECK: %trunc = trunc i56 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i56 + %b = load i56, i56* %a, align 1 + %extbit.shl = shl nuw nsw i56 %extbit, 13 + %c = and i56 %b, -8193 + %d = or i56 %c, %extbit.shl + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i16* +; CHECK: %load.trunc = load i16, i16* %cast1, align 2 +; CHECK: %or.trunc = or i16 %load.trunc, 272 +; CHECK: store i16 %or.trunc, i16* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -385,6 +385,7 @@ initializeTarget(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. + initializeMemAccessShrinkingPassPass(Registry); initializeCodeGenPreparePass(Registry); initializeAtomicExpandPass(Registry); initializeRewriteSymbolsLegacyPassPass(Registry);