Index: include/llvm/CodeGen/Passes.h
===================================================================
--- include/llvm/CodeGen/Passes.h
+++ include/llvm/CodeGen/Passes.h
@@ -64,6 +64,9 @@
   MachineFunctionPass *createResetMachineFunctionPass(bool EmitFallbackDiag, bool AbortOnFailedISel);

+  /// This pass shrinks some loads/stores to narrower accesses of legal type.
+  FunctionPass *createMemAccessShrinkingPass(const TargetMachine *TM = nullptr);
+
   /// createCodeGenPreparePass - Transform the code to expose more pattern
   /// matching during instruction selection.
   FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = nullptr);
Index: include/llvm/IR/PatternMatch.h
===================================================================
--- include/llvm/IR/PatternMatch.h
+++ include/llvm/IR/PatternMatch.h
@@ -317,6 +317,9 @@
 /// \brief Match a ConstantInt, capturing the value if we match.
 inline bind_ty<ConstantInt> m_ConstantInt(ConstantInt *&CI) { return CI; }

+/// \brief Match a load instruction, capturing the value if we match.
+inline bind_ty<LoadInst> m_Load(LoadInst *&LI) { return LI; }
+
 /// \brief Match a Constant, capturing the value if we match.
 inline bind_ty<Constant> m_Constant(Constant *&C) { return C; }
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -243,6 +243,7 @@
 void initializeMachineSinkingPass(PassRegistry&);
 void initializeMachineTraceMetricsPass(PassRegistry&);
 void initializeMachineVerifierPassPass(PassRegistry&);
+void initializeMemAccessShrinkingPassPass(PassRegistry&);
 void initializeMemCpyOptLegacyPassPass(PassRegistry&);
 void initializeMemDepPrinterPass(PassRegistry&);
 void initializeMemDerefPrinterPass(PassRegistry&);
Index: include/llvm/LinkAllPasses.h
===================================================================
--- include/llvm/LinkAllPasses.h
+++ include/llvm/LinkAllPasses.h
@@ -166,6 +166,7 @@
       (void) llvm::createCountingFunctionInserterPass();
       (void) llvm::createEarlyCSEPass();
       (void) llvm::createGVNHoistPass();
+      (void) llvm::createMemAccessShrinkingPass();
       (void) llvm::createMergedLoadStoreMotionPass();
       (void) llvm::createGVNPass();
       (void) llvm::createNewGVNPass();
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1900,6 +1900,14 @@
     return false;
   }

+  /// Return true if it is expensive to narrow operations of type VT1 to
+  /// VT2. Sometimes we want to try load/store shrinking to save extra
+  /// bit manipulation operations, and we need to make sure the shrinking
+  /// is not so expensive that it offsets the benefit.
+  virtual bool isNarrowingExpensive(EVT /*VT1*/, EVT /*VT2*/) const {
+    return true;
+  }
+
   /// \brief Return true if it is beneficial to convert a load of a constant to
   /// just the constant itself.
  /// On some targets it might be more efficient to use a combination of
Index: lib/CodeGen/CMakeLists.txt
===================================================================
--- lib/CodeGen/CMakeLists.txt
+++ lib/CodeGen/CMakeLists.txt
@@ -84,6 +84,7 @@
   MachineSSAUpdater.cpp
   MachineTraceMetrics.cpp
   MachineVerifier.cpp
+  MemAccessShrinking.cpp
   PatchableFunction.cpp
   MIRPrinter.cpp
   MIRPrintingPass.cpp
Index: lib/CodeGen/CodeGen.cpp
===================================================================
--- lib/CodeGen/CodeGen.cpp
+++ lib/CodeGen/CodeGen.cpp
@@ -48,6 +48,7 @@
   initializeMachineBlockPlacementPass(Registry);
   initializeMachineBlockPlacementStatsPass(Registry);
   initializeMachineCSEPass(Registry);
+  initializeMemAccessShrinkingPassPass(Registry);
   initializeImplicitNullChecksPass(Registry);
   initializeMachineCombinerPass(Registry);
   initializeMachineCopyPropagationPass(Registry);
Index: lib/CodeGen/MemAccessShrinking.cpp
===================================================================
--- lib/CodeGen/MemAccessShrinking.cpp
+++ lib/CodeGen/MemAccessShrinking.cpp
@@ -0,0 +1,1065 @@
+//===- MemAccessShrinking.cpp - Shrink the type of mem accesses -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// The pass tries to find opportunities where we can save bit manipulation
+/// operations or convert illegal type operations to legal type by shrinking
+/// the type of memory accesses.
+///
+/// A major motivation of the optimization:
+/// Consecutive bitfields are now wrapped as a group and represented as a
+/// large integer. To access a specific bitfield, a wide load/store plus a
+/// series of bit operations are needed. However, if the bitfield is of legal
+/// type, it is more efficient to access it directly. Another case is when the
+/// bitfield group is of illegal integer type: if we only access one bitfield
+/// in the group, we may have a chance to shrink the illegal type to a smaller
+/// legal type and save some bit mask operations.
+/// Although the major motivation of the optimization is bitfield accesses,
+/// it is not limited to them and can be applied to general memory accesses
+/// with similar patterns.
+///
+/// Below are some typical patterns and the way they are transformed, to give
+/// an idea of how the optimization works. More general patterns that are
+/// recognized can be found in the function header comments.
+///
+/// P1: store(or(and(load P, Cst), MaskedVal), P)
+///     If the bit operations show that only the range (width+shift, shift]
+///     of MaskedVal is extracted and filled into the target memory, it will
+///     be transformed to:
+///       narrow_store(trunc(lshr(MaskedVal, shift) to width), P+shift)
+///     Note: It is often the case that the pattern "trunc(lshr(MaskedVal, ...)"
+///     can be further simplified by the DAG combiner, which increases the
+///     benefit.
+///
+/// P2: and(lshr(load P, Cst1), Cst2)
+///     If the bit operations show that only the range (width+shift, shift]
+///     of load P holds the bits actually used, it will be transformed to:
+///       narrow_load(P+shift), with the load type shrunk to a 'width'-bit
+///     integer.
+///
+/// Some notes about the algorithm:
+/// * The algorithm scans the program and tries to recognize and transform the
+///   patterns above. It runs iteratively until no more changes happen.
+/// * To prevent the optimization from blocking load/store coalescing, it is
+///   invoked late in the pipeline, just before CodeGenPrepare. In this late
+///   stage, both the pattern matching and the related safety checks become
+///   more difficult because earlier optimizations have introduced merged
+///   loads/stores and more complex control flow. That is why MemorySSA is
+///   used here: it is scalable and precise for the safety checks, especially
+///   when we try to insert a shrunken load in a block that is many blocks
+///   away from its original load.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
+#include "llvm/Transforms/Utils/MemorySSAUpdater.h"
+
+#define DEBUG_TYPE "memaccessshrink"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+STATISTIC(NumStoreShrinked, "Number of stores shrunk");
+STATISTIC(NumLoadShrinked, "Number of loads shrunk");
+
+static cl::opt<bool> EnableLoadShrink("enable-load-shrink", cl::init(true));
+static cl::opt<bool> EnableStoreShrink("enable-store-shrink", cl::init(true));
+
+namespace {
+/// Describes the range of bits that a memory access actually uses or
+/// modifies.
+struct ModRange {
+  unsigned Shift;
+  unsigned Width;
+  bool ShrinkWithMaskedVal;
+};
+
+/// \brief The memory access shrinking pass.
+class MemAccessShrinkingPass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  MemAccessShrinkingPass(const TargetMachine *TM = nullptr) : FunctionPass(ID) {
+    this->TM = TM;
+    initializeMemAccessShrinkingPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &Fn) override;
+
+  StringRef getPassName() const override { return "MemAccess Shrinking"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<MemorySSAWrapperPass>();
+    AU.addPreserved<MemorySSAWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+  }
+
+private:
+  const DataLayout *DL = nullptr;
+  const DominatorTree *DT = nullptr;
+  const TargetMachine *TM = nullptr;
+  const TargetLowering *TLI = nullptr;
+  const TargetLibraryInfo *TLInfo = nullptr;
+  LLVMContext *Context;
+
+  MemorySSA *MSSA;
+  MemorySSAWalker *MSSAWalker;
+  std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
+
+  // Candidate instructions to be erased if they have no other uses.
+  SmallPtrSet<Instruction *, 8> CandidatesToErase;
+
+  // MultiUsesSeen indicates that a node with multiple uses was found on the
+  // def-use chain.
+  bool MultiUsesSeen;
+  // SavedInsts records how many instructions the current shrink has saved so
+  // far. It is used to evaluate whether the shrinking still saves
+  // instructions, especially when MultiUsesSeen is true.
+ unsigned SavedInsts; + + void analyzeOrAndPattern(Value &MaskedVal, ConstantInt &Cst, ModRange &MR, + unsigned TBits); + void analyzeBOpPattern(Value &Val, ConstantInt &Cst, ModRange &MR, + unsigned TBits); + bool updateModRange(ModRange &MR, unsigned TBits, unsigned Align); + bool hasClobberBetween(Instruction &From, Instruction &To); + bool needNewStore(Value &OrigVal, StoreInst &SI); + Value *createNewPtr(Value *Ptr, unsigned StOffset, Type *NewTy, + IRBuilder<> &Builder); + + bool reduceLoadOpsStoreWidth(StoreInst &SI); + bool tryShrinkOnInst(Instruction &Inst); + bool reduceLoadOpsWidth(Instruction &IN); + Value *shrinkInsts(Value *NewPtr, ModRange &MR, + SmallVectorImpl &Insts, + Instruction *NewInsertPt, unsigned ResShift, + IRBuilder<> &Builder); + bool matchInstsToShrink(Value *Val, ModRange &MR, + SmallVectorImpl &Insts); + bool hasClobberBetween(Instruction &From, Instruction &To, + const MemoryLocation &Mem, Instruction *&NewInsertPt); + void addSavedInst(Instruction &Inst, bool ReplaceAllUses = false); + bool isBeneficial(unsigned ResShift, SmallVectorImpl &Insts); + void markCandForDeletion(Instruction &I) { CandidatesToErase.insert(&I); } + bool removeDeadInsts(); +}; +} // namespace + +char MemAccessShrinkingPass::ID = 0; +INITIALIZE_TM_PASS_BEGIN(MemAccessShrinkingPass, "memaccessshrink", + "MemAccess Shrinking", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_TM_PASS_END(MemAccessShrinkingPass, "memaccessshrink", + "MemAccess Shrinking", false, false) + +FunctionPass *llvm::createMemAccessShrinkingPass(const TargetMachine *TM) { + return new MemAccessShrinkingPass(TM); +} + +/// Analyze or ((and (load P), \p Cst), \p MaskedVal). Update \p MR.Width +/// with the number of bits of the original load to be modified, and update +/// \p MR.Shift with the pos of the first bit to be modified. If the +/// analysis result shows we can use bits extracted from MaskedVal as store +/// value, set \p MR.ShrinkWithMaskedVal to be true. +void MemAccessShrinkingPass::analyzeOrAndPattern(Value &MaskedVal, + ConstantInt &Cst, ModRange &MR, + unsigned TBits) { + // Cst is the mask. Analyze the pattern of mask after sext it to uint64_t. We + // will handle patterns like either 0..01..1 or 1..10..01..1 + APInt Mask = Cst.getValue().sextOrTrunc(TBits); + assert(Mask.getBitWidth() == TBits && "Unexpected bitwidth of Mask"); + unsigned MaskLeadOnes = Mask.countLeadingOnes(); + if (MaskLeadOnes == TBits) + return; + unsigned MaskTrailOnes = Mask.countTrailingOnes(); + unsigned MaskMidZeros = !MaskLeadOnes + ? Mask.countLeadingZeros() + : Mask.ashr(MaskTrailOnes).countTrailingZeros(); + + MR.ShrinkWithMaskedVal = true; + // See if we have a continuous run of zeros. + if (MaskLeadOnes + MaskMidZeros + MaskTrailOnes != TBits) { + MaskMidZeros = TBits - MaskLeadOnes - MaskTrailOnes; + MR.ShrinkWithMaskedVal = false; + } + + // Check MaskedVal only provides nonzero bits within range from lowbits + // (MaskTrailOnes) to highbits (MaskTrailOnes + MaskMidZeros). + APInt BitMask = + ~APInt::getBitsSet(TBits, MaskTrailOnes, MaskTrailOnes + MaskMidZeros); + + // Find out the range in which 1 appears in MaskedVal. 
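As an aside, the mask decomposition performed by analyzeOrAndPattern above can be illustrated with a small standalone C++ sketch. It is not part of the patch: it uses a fixed 64-bit width and plain integers instead of APInt, and all helper names are invented for illustration.

#include <cassert>
#include <cstdint>

// Decompose a 64-bit mask of the form 1..10..01..1 (or 0..01..1): the middle
// run of zeros is the range that the or+and pair replaces with bits taken
// from MaskedVal. Shift is the index of the first replaced bit and Width the
// number of replaced bits.
struct BitRange {
  unsigned Shift, Width;
  bool Contiguous;
};

static unsigned countTrailingOnes(uint64_t V) {
  unsigned N = 0;
  while (V & 1) {
    V >>= 1;
    ++N;
  }
  return N;
}

static unsigned countLeadingOnes(uint64_t V) {
  unsigned N = 0;
  for (int I = 63; I >= 0 && ((V >> I) & 1); --I)
    ++N;
  return N;
}

static BitRange decomposeMask(uint64_t Mask) {
  if (Mask == ~0ULL) // Nothing is replaced.
    return {0, 0, true};
  unsigned TrailOnes = countTrailingOnes(Mask);
  unsigned LeadOnes = countLeadingOnes(Mask);
  unsigned MidZeros = 64 - LeadOnes - TrailOnes;
  // The decomposition is only usable if the zeros form one contiguous run.
  uint64_t ZeroRun =
      (MidZeros == 64) ? ~0ULL : (((1ULL << MidZeros) - 1) << TrailOnes);
  return {TrailOnes, MidZeros, ~Mask == ZeroRun};
}

int main() {
  // or(and(load, 0xFFFFFFFFFFFF00FF), MaskedVal): only bits [8, 16) of the
  // loaded value are replaced, so the store can be shrunk to one byte at
  // byte offset 1 on a little-endian target.
  BitRange BR = decomposeMask(0xFFFFFFFFFFFF00FFULL);
  assert(BR.Shift == 8 && BR.Width == 8 && BR.Contiguous);
  return 0;
}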
+  APInt KnownOne(TBits, 0), KnownZero(TBits, 0);
+  computeKnownBits(&MaskedVal, KnownZero, KnownOne, *DL, 0);
+
+  MR.Shift = MaskTrailOnes;
+  MR.Width = MaskMidZeros;
+  // We expect the bits that are 1 in BitMask to all be known-zero bits of
+  // MaskedVal; otherwise we need to set ShrinkWithMaskedVal to false and
+  // expand MR.
+  if ((KnownZero & BitMask) != BitMask) {
+    MR.ShrinkWithMaskedVal = false;
+    // Lower is the first bit that is not known to be zero.
+    unsigned Lower = KnownZero.countTrailingOnes();
+    // Higher is one past the last bit that is not known to be zero.
+    unsigned Higher = TBits - KnownZero.countLeadingOnes();
+    MR.Shift = std::min(Lower, MaskTrailOnes);
+    MR.Width = std::max(Higher, MaskTrailOnes + MaskMidZeros) - MR.Shift;
+  }
+}
+
+/// Analyze \p Val = or/xor/and(load P, \p Cst). Update \p MR.Width with the
+/// number of bits of the original load to be modified, and update \p MR.Shift
+/// with the position of the first bit to be modified.
+void MemAccessShrinkingPass::analyzeBOpPattern(Value &Val, ConstantInt &Cst,
+                                               ModRange &MR, unsigned TBits) {
+  APInt Mask = Cst.getValue().sextOrTrunc(TBits);
+  BinaryOperator *BOP = cast<BinaryOperator>(&Val);
+  if (BOP->getOpcode() == Instruction::And)
+    Mask = ~Mask;
+
+  MR.Shift = Mask.countTrailingZeros();
+  MR.Width = Mask.getBitWidth() - MR.Shift;
+  if (MR.Width)
+    MR.Width = MR.Width - Mask.countLeadingZeros();
+  MR.ShrinkWithMaskedVal = false;
+}
+
+/// Update \p MR.Width and \p MR.Shift so that the updated \p MR.Width bits
+/// can form a legal type and also cover all the modified bits.
+bool MemAccessShrinkingPass::updateModRange(ModRange &MR, unsigned TBits,
+                                            unsigned Align) {
+  ModRange NewMR;
+  NewMR.Width = PowerOf2Ceil(MR.Width);
+  Type *NewTy = Type::getIntNTy(*Context, NewMR.Width);
+
+  // Check if we can find a new Shift for the Width of NewMR, so that
+  // NewMR forms a new range covering the old modified range without
+  // worsening alignment.
+  auto coverOldRange = [&](ModRange &NewMR, ModRange &MR) -> bool {
+    unsigned MAlign = MinAlign(Align, DL->getABITypeAlignment(NewTy));
+    int Shift = MR.Shift - MR.Shift % (MAlign * 8);
+    while (Shift >= 0) {
+      if (NewMR.Width + Shift <= TBits &&
+          NewMR.Width + Shift >= MR.Width + MR.Shift) {
+        NewMR.Shift = Shift;
+        return true;
+      }
+      Shift -= MAlign * 8;
+    }
+    return false;
+  };
+  // See whether we can store NewTy legally.
+  auto isStoreLegalType = [&](Type *NewTy) -> bool {
+    EVT OldEVT =
+        TLI->getValueType(*DL, Type::getIntNTy(*Context, PowerOf2Ceil(TBits)));
+    EVT NewEVT = TLI->getValueType(*DL, NewTy);
+    return TLI->isOperationLegalOrCustom(ISD::STORE, NewEVT) ||
+           TLI->isTruncStoreLegalOrCustom(OldEVT, NewEVT);
+  };
+  // Try to find the minimal NewMR.Width which can form a legal type and cover
+  // all the old modified bits.
+  while (NewMR.Width < TBits &&
+         (!isStoreLegalType(NewTy) || !coverOldRange(NewMR, MR))) {
+    NewMR.Width = NextPowerOf2(NewMR.Width);
+    NewTy = Type::getIntNTy(*Context, NewMR.Width);
+  }
+  MR.Width = NewMR.Width;
+  MR.Shift = NewMR.Shift;
+
+  if (MR.Width >= TBits)
+    return false;
+  return true;
+}
+
+/// Compute the byte offset used to build the new pointer address. It is
+/// mainly based on MR and the endianness of the target.
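Before the definition of computeStOffset below, here is a minimal standalone C++ sketch of the byte-offset rule it implements. It is not part of the patch, the function name is invented, and it only illustrates the little-/big-endian cases for a byte-aligned range.

#include <cassert>

// Byte offset, within the in-memory representation of a TBits-wide integer,
// of the bit range [Shift, Shift + Width). Mirrors the endianness handling
// of computeStOffset for ranges that are byte aligned.
static unsigned fieldByteOffset(unsigned TBits, unsigned Shift, unsigned Width,
                                bool BigEndian) {
  unsigned BitOffset = BigEndian ? TBits - Shift - Width : Shift;
  assert(BitOffset % 8 == 0 && "only byte-aligned ranges are addressable");
  return BitOffset / 8;
}

int main() {
  // An 8-bit range starting at bit 48 of an i64 lives in byte 6 on a
  // little-endian target and in byte 1 on a big-endian target.
  assert(fieldByteOffset(64, 48, 8, /*BigEndian=*/false) == 6);
  assert(fieldByteOffset(64, 48, 8, /*BigEndian=*/true) == 1);
  return 0;
}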
+static unsigned computeStOffset(ModRange &MR, unsigned TBits,
+                                const DataLayout &DL) {
+  unsigned StOffset;
+  if (DL.isBigEndian())
+    StOffset = TBits - MR.Shift - MR.Width;
+  else
+    StOffset = MR.Shift;
+
+  if (StOffset % 8 != 0)
+    MR.ShrinkWithMaskedVal = false;
+
+  return StOffset / 8;
+}
+
+/// Check whether V1 and V2 have the same pointer value by looking through
+/// bitcasts.
+static bool hasSamePtr(Value *V1, Value *V2) {
+  Value *NV1 = nullptr;
+  Value *NV2 = nullptr;
+  if (V1 == V2)
+    return true;
+  if (BitCastInst *BC1 = dyn_cast<BitCastInst>(V1))
+    NV1 = BC1->getOperand(0);
+  if (BitCastInst *BC2 = dyn_cast<BitCastInst>(V2))
+    NV2 = BC2->getOperand(0);
+  if (!NV1 && !NV2)
+    return false;
+  if (V1 == NV2 || V2 == NV1 || NV1 == NV2)
+    return true;
+  return false;
+}
+
+/// Check whether there is any clobbering instruction between \p From and
+/// \p To for the memory location accessed by \p To.
+bool MemAccessShrinkingPass::hasClobberBetween(Instruction &From,
+                                               Instruction &To) {
+  assert(MSSA->getMemoryAccess(&To) && "Expect To has valid MemoryAccess");
+  MemoryAccess *FromAccess = MSSA->getMemoryAccess(&From);
+  assert(FromAccess && "Expect From has valid MemoryAccess");
+  MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(&To);
+  if (FromAccess != DefiningAccess &&
+      MSSA->dominates(FromAccess, DefiningAccess))
+    return true;
+  return false;
+}
+
+/// It is possible that a store of OrigVal already exists before \p SI and is
+/// not clobbered; in that case we can reuse it and do not have to generate a
+/// new store.
+bool MemAccessShrinkingPass::needNewStore(Value &OrigVal, StoreInst &SI) {
+  for (User *U : OrigVal.users()) {
+    StoreInst *PrevSI = dyn_cast<StoreInst>(U);
+    if (!PrevSI ||
+        !hasSamePtr(PrevSI->getPointerOperand(), SI.getPointerOperand()))
+      continue;
+    if (!hasClobberBetween(*PrevSI, SI))
+      return false;
+  }
+  return true;
+}
+
+/// Create the new address to be used by the shrunken load/store, based on the
+/// original address \p Ptr, the byte offset \p StOffset and the new type
+/// \p NewTy.
+Value *MemAccessShrinkingPass::createNewPtr(Value *Ptr, unsigned StOffset,
+                                            Type *NewTy, IRBuilder<> &Builder) {
+  Value *NewPtr = Ptr;
+  unsigned AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
+  if (StOffset) {
+    ConstantInt *Idx = ConstantInt::get(Type::getInt32Ty(*Context), StOffset);
+    NewPtr =
+        Builder.CreateBitCast(Ptr, Type::getInt8PtrTy(*Context, AS), "cast");
+    NewPtr =
+        Builder.CreateGEP(Type::getInt8Ty(*Context), NewPtr, Idx, "uglygep");
+  }
+  return Builder.CreateBitCast(NewPtr, NewTy->getPointerTo(AS), "cast");
+}
+
+/// Try to shrink the store when the input \p SI has one of the patterns:
+///   Pattern1: or(and(Val, Cst), MaskedVal).
+///   Pattern2: or/and/xor(load, Cst).
+/// For the first pattern, when Cst and MaskedVal satisfy some requirements,
+/// the or+and pair has the property that only a portion of Val is modified
+/// and the rest of it is not changed. We want to shrink the store to be
+/// aligned to the modified range of Val.
+/// Pattern1 after the shrinking looks like:
+///
+///   store(Val)   // The store can be omitted if Val is a load with the same
+///                // address as the original store.
+///   narrow_store(shrunken MaskedVal)
+///
+/// However, if some condition is not satisfied, which is indicated by
+/// MR::ShrinkWithMaskedVal being false, we may try another kind of shrinking:
+/// shrinking the store to a legal type if it is of illegal type.
So for +/// Pattern2 and Pattern1 when Val is load and MR::ShrinkWithMaskedVal is +/// false, if the store is of illegal type, we may shrink them to: +/// +/// store(or(and(shrinked load, shrinked Cst), shrinked MaskedVal)), or +/// store(or/and/xor(shrinked load, shrinked Cst)) +/// +/// After the shrinking, all the operations will be of legal type. +bool MemAccessShrinkingPass::reduceLoadOpsStoreWidth(StoreInst &SI) { + Value *Val = SI.getOperand(0); + Value *Ptr = SI.getOperand(1); + Type *StoreTy = Val->getType(); + if (StoreTy->isVectorTy() || !StoreTy->isIntegerTy()) + return false; + + if (SI.isVolatile() || !SI.isUnordered()) + return false; + + unsigned TBits = DL->getTypeSizeInBits(StoreTy); + if (TBits != DL->getTypeStoreSizeInBits(StoreTy)) + return false; + + LoadInst *LI = nullptr; + Value *OrigVal = nullptr; + Value *MaskedVal; + ConstantInt *Cst; + // Match or(and(Val, Cst), MaskedVal) pattern or their correspondents + // after commutation. + // However the pattern matching below is more complex than that because + // of the commutation and some matching preference we expect. + // for case: or(and(or(and(LI, Cst_1), MaskedVal_1), Cst_2), MaskedVal_2) + // and if MaskedVal_2 happens to be another and operator, we hope we can + // match or(and(LI, Cst_1), MaskedVal_1) to Val instead of MaskedVal. + bool OrAndPattern = + match(Val, m_c_Or(m_And(m_Load(LI), m_ConstantInt(Cst)), + m_Value(MaskedVal))) || + (match(Val, m_Or(m_And(m_Value(OrigVal), m_ConstantInt(Cst)), + m_Value(MaskedVal))) && + match(OrigVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))) || + (match(Val, m_Or(m_Value(MaskedVal), + m_And(m_Value(OrigVal), m_ConstantInt(Cst)))) && + match(OrigVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))); + // Match "or/and/xor(load, cst)" pattern or their correspondents after + // commutation. + bool BopPattern = match(Val, m_c_Or(m_Load(LI), m_ConstantInt(Cst))) || + match(Val, m_c_And(m_Load(LI), m_ConstantInt(Cst))) || + match(Val, m_c_Xor(m_Load(LI), m_ConstantInt(Cst))); + if (!OrAndPattern && !BopPattern) + return false; + + // LI should have the same address as SI. + if (LI && !hasSamePtr(LI->getPointerOperand(), Ptr)) + return false; + + if (LI && (LI->isVolatile() || !LI->isUnordered())) + return false; + + // Make sure the memory location of LI/SI is not clobbered between them. + if (LI && hasClobberBetween(*LI, SI)) + return false; + + // Analyze MR which indicates the range of the input that will actually + // be modified and stored. + ModRange MR = {0, 0, true}; + if (OrAndPattern) + analyzeOrAndPattern(*MaskedVal, *Cst, MR, TBits); + if (BopPattern) + analyzeBOpPattern(*Val, *Cst, MR, TBits); + + assert(MR.Shift + MR.Width <= TBits && "Unexpected ModRange!"); + if (!MR.Width) + return false; + + unsigned StOffset; + if (MR.ShrinkWithMaskedVal) { + // Get the offset from Ptr for the shrinked store. + StOffset = computeStOffset(MR, TBits, *DL); + + // If MR.Width is not the length of legal type, we cannot + // store MaskedVal directly. + if (MR.Width != 8 && MR.Width != 16 && MR.Width != 32) + MR.ShrinkWithMaskedVal = false; + } + + unsigned Align = SI.getAlignment(); + // If we are shrink illegal type of store with original val, update MR.Width + // and MR.Shift to ensure the shrinked store is of legal type. + if (!MR.ShrinkWithMaskedVal) { + // Check whether the store is of illegal type. If it is not, don't bother. 
+ if (!TLI || TLI->isOperationLegalOrCustom(ISD::STORE, + TLI->getValueType(*DL, StoreTy))) + return false; + // Try to find out a MR of legal type. + if (!updateModRange(MR, TBits, Align)) + return false; + // It is meaningless to shrink illegal type store for and(or(Val, ...) + // pattern if Val is not a load, because we still have to insert another + // illegal type store for Val. + if (OrAndPattern && !LI) + return false; + + StOffset = computeStOffset(MR, TBits, *DL); + } + IntegerType *NewTy = Type::getIntNTy(*Context, MR.Width); + if (TLI->isNarrowingExpensive(EVT::getEVT(StoreTy), EVT::getEVT(NewTy))) + return false; + + // Start shrinking the size of the store. + IRBuilder<> Builder(*Context); + Builder.SetInsertPoint(&SI); + Value *NewPtr = createNewPtr(Ptr, StOffset, NewTy, Builder); + + if (StOffset) + Align = MinAlign(StOffset, Align); + + LoadInst *NewLI; + if (MR.ShrinkWithMaskedVal) { + // Check if we need to split the original store and generate a new + // store for the OrigVal. + if (OrigVal && needNewStore(*OrigVal, SI)) { + StoreInst *NewSI = cast(SI.clone()); + NewSI->setOperand(0, OrigVal); + NewSI->setOperand(1, Ptr); + Builder.Insert(NewSI); + DEBUG(dbgs() << "MemShrink: Insert" << *NewSI << " before" << SI << "\n"); + // MemorySSA update for the new store. + MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MemoryDef *NewMemAcc = cast( + MSSAUpdater->createMemoryAccessBefore(NewSI, Def, OldMemAcc)); + MSSAUpdater->insertDef(NewMemAcc, false); + } + } else { + // MemorySSA update for the shrinked load. + NewLI = cast(LI->clone()); + NewLI->mutateType(NewTy); + NewLI->setOperand(0, NewPtr); + NewLI->setAlignment(Align); + Builder.Insert(NewLI, "load.trunc"); + MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MSSAUpdater->createMemoryAccessBefore(NewLI, Def, OldMemAcc); + } + + // Create the New Value to store. + Value *NewVal = nullptr; + APInt ModifiedCst = Cst->getValue().lshr(MR.Shift).trunc(MR.Width); + ConstantInt *NewCst = ConstantInt::get(*Context, ModifiedCst); + if (OrAndPattern) { + // Shift and truncate MaskedVal. + Value *Trunc; + if (auto MVCst = dyn_cast(MaskedVal)) { + ModifiedCst = MVCst->getValue().lshr(MR.Shift).trunc(MR.Width); + Trunc = ConstantInt::get(*Context, ModifiedCst); + } else { + Value *ShiftedVal = MR.Shift + ? Builder.CreateLShr(MaskedVal, MR.Shift, "lshr") + : MaskedVal; + Trunc = Builder.CreateTruncOrBitCast(ShiftedVal, NewTy, "trunc"); + } + if (MR.ShrinkWithMaskedVal) { + NewVal = Trunc; + } else { + Value *NewAnd = Builder.CreateAnd(NewLI, NewCst, "and.trunc"); + NewVal = Builder.CreateOr(NewAnd, Trunc, "or.trunc"); + } + } else { + BinaryOperator *BOP = cast(Val); + switch (BOP->getOpcode()) { + default: + llvm_unreachable("BOP can only be And/Or/Xor"); + case Instruction::And: + NewVal = Builder.CreateAnd(NewLI, NewCst, "and.trunc"); + break; + case Instruction::Or: + NewVal = Builder.CreateOr(NewLI, NewCst, "or.trunc"); + break; + case Instruction::Xor: + NewVal = Builder.CreateXor(NewLI, NewCst, "xor.trunc"); + break; + } + } + + // Create the new store and remove the old one. + StoreInst *NewSI = cast(SI.clone()); + NewSI->setOperand(0, NewVal); + NewSI->setOperand(1, NewPtr); + NewSI->setAlignment(Align); + Builder.Insert(NewSI); + + DEBUG(dbgs() << "MemShrink: Replace" << SI << " with" << *NewSI << "\n"); + // MemorySSA update for the shrinked store. 
+ MemoryDef *OldMemAcc = cast(MSSA->getMemoryAccess(&SI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + MemoryAccess *NewMemAcc = + MSSAUpdater->createMemoryAccessBefore(NewSI, Def, OldMemAcc); + OldMemAcc->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(OldMemAcc); + + markCandForDeletion(*cast(Val)); + SI.eraseFromParent(); + NumStoreShrinked++; + return true; +} + +/// The driver of shrinking and final dead instructions cleanup. +bool MemAccessShrinkingPass::tryShrinkOnInst(Instruction &Inst) { + StoreInst *SI = dyn_cast(&Inst); + if (EnableStoreShrink && SI) + return reduceLoadOpsStoreWidth(*SI); + if (EnableLoadShrink && (isa(&Inst) || isa(&Inst))) + return reduceLoadOpsWidth(Inst); + return false; +} + +/// Check the range of Cst containing non-zero bit is within \p MR when +/// \p AInB is true. If \p AInB is false, check MR is within the range +/// of Cst. +static bool CompareAPIntWithModRange(APInt &AI, ModRange &MR, bool AInB) { + unsigned LZ = AI.countLeadingZeros(); + unsigned TZ = AI.countTrailingZeros(); + unsigned TBits = AI.getBitWidth(); + if (AInB && (TZ < MR.Shift || (TBits - LZ) > MR.Width + MR.Shift)) + return false; + + if (!AInB && (MR.Shift < TZ || MR.Width + MR.Shift > (TBits - LZ))) + return false; + return true; +} + +/// Check if there is overlap between range \MR and \OtherMR. +static bool nonoverlap(ModRange &MR, ModRange &OtherMR) { + return (MR.Shift > OtherMR.Shift + OtherMR.Width - 1) || + (OtherMR.Shift > MR.Shift + MR.Width - 1); +} + +/// Check there is no instruction between *LI and IN which may clobber +/// the MemoryLocation specified by MR. We relax it a little bit to allow +/// clobber instruction in the same BB as IN, if that happens we need to +/// set the NewInsertPt to be the new location to insert the shrinked load. +bool MemAccessShrinkingPass::hasClobberBetween(Instruction &From, + Instruction &To, + const MemoryLocation &Mem, + Instruction *&NewInsertPt) { + assert(DT->dominates(&From, &To) && "From doesn't dominate To"); + const BasicBlock *FBB = From.getParent(); + const BasicBlock *TBB = To.getParent(); + // Collect BasicBlocks to scan between FBB and TBB into BBSet. + // Limit the maximum number of BasicBlocks to 3 to protect compile time. + SmallPtrSet BBSet; + SmallVector Worklist; + BBSet.insert(FBB); + BBSet.insert(TBB); + Worklist.push_back(TBB); + do { + const BasicBlock *BB = Worklist.pop_back_val(); + for (const BasicBlock *Pred : predecessors(BB)) { + if (!BBSet.count(Pred)) { + BBSet.insert(Pred); + Worklist.push_back(Pred); + } + } + } while (!Worklist.empty()); + + for (const BasicBlock *BB : BBSet) { + const MemorySSA::DefsList *DList = MSSA->getBlockDefs(BB); + if (!DList) + continue; + + for (const MemoryAccess &MA : *DList) { + if (const MemoryDef *MD = dyn_cast(&MA)) { + Instruction *Inst = MD->getMemoryInst(); + if ((FBB == Inst->getParent() && DT->dominates(Inst, &From)) || + (TBB == Inst->getParent() && DT->dominates(&To, Inst))) + continue; + if (MSSAWalker->getClobberingMemoryAccess(const_cast(MD), + Mem) == MD) { + if (TBB == Inst->getParent()) { + if (!NewInsertPt || DT->dominates(Inst, NewInsertPt)) + NewInsertPt = Inst; + continue; + } + return true; + } + } + } + } + return false; +} + +/// Match \p Val to the pattern: +/// Bop(...Bop(V, Cst_1), Cst_2, ..., Cst_N) and pattern: +/// or(and(...or(and(LI, Cst_1), MaskedVal_1), ..., Cse_N), MaskedVal_N), +/// and find the final load to be shrinked. 
+/// All the Bop instructions in the first pattern and the final load will +/// be added into \p Insts and will be shrinked afterwards. +bool MemAccessShrinkingPass::matchInstsToShrink( + Value *Val, ModRange &MR, SmallVectorImpl &Insts) { + Value *Opd; + ConstantInt *Cst; + unsigned TBits = DL->getTypeSizeInBits(Val->getType()); + + // Match the pattern: + // Bop(...Bop(V, Cst_1), Cst_2, ..., Cst_N), Bop can be Add/Sub/And/Or/Xor. + // All these Csts cannot have one bits outside of range MR, so that we can + // shrink these Bops to be MR.Width bits integer type. + while (match(Val, m_Add(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Sub(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_And(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Or(m_Value(Opd), m_ConstantInt(Cst))) || + match(Val, m_Xor(m_Value(Opd), m_ConstantInt(Cst)))) { + addSavedInst(*cast(Val)); + APInt AI = Cst->getValue().zextOrTrunc(TBits); + if (!CompareAPIntWithModRange(AI, MR, true)) + return false; + Insts.push_back(cast(Val)); + Val = Opd; + } + + LoadInst *LI; + if ((LI = dyn_cast(Val))) { + addSavedInst(*LI); + Insts.push_back(LI); + return true; + } + + // Match the pattern: + // or(and(...or(and(LI, Cst_1), MaskedVal_1), ..., Cse_N), MaskedVal_N) + // Analyze the ModRange of each or(and(...)) pair and see if it has + // any overlap with MR. If any overlap is found, we cannot do shrinking + // for the final Load LI. + Value *MaskedVal; + Value *OrigVal; + ModRange OtherMR; + while (match(Val, m_c_Or(m_And(m_Load(LI), m_ConstantInt(Cst)), + m_Value(MaskedVal))) || + (match(Val, m_Or(m_And(m_Value(OrigVal), m_ConstantInt(Cst)), + m_Value(MaskedVal))) && + match(OrigVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value()))) || + (match(Val, m_Or(m_Value(MaskedVal), + m_And(m_Value(OrigVal), m_ConstantInt(Cst)))) && + match(OrigVal, m_c_Or(m_And(m_Value(), m_Value()), m_Value())))) { + addSavedInst(*cast(Val)); + OtherMR.Shift = 0; + OtherMR.Width = TBits; + // Analyze ModRange OtherMR of A from or(and(A, Cst), MaskedVal) pattern. + analyzeOrAndPattern(*MaskedVal, *Cst, OtherMR, TBits); + // If OtherMR has overlap with MR, we cannot shrink load to its MR portion. + if (!nonoverlap(MR, OtherMR)) + return false; + if (LI) { + Insts.push_back(LI); + return true; + } + Val = OrigVal; + } + + return false; +} + +/// Find the next MemoryAccess after LI in the same BB. +MemoryAccess *findNextMemoryAccess(LoadInst &LI, MemorySSA &MSSA) { + for (auto Scan = LI.getIterator(); Scan != LI.getParent()->end(); ++Scan) { + if (MemoryAccess *MA = MSSA.getMemoryAccess(&*Scan)) + return MA; + } + return nullptr; +} + +/// Shrink \p Insts according to the range MR. +Value *MemAccessShrinkingPass::shrinkInsts( + Value *NewPtr, ModRange &MR, SmallVectorImpl &Insts, + Instruction *NewInsertPt, unsigned ResShift, IRBuilder<> &Builder) { + Value *NewVal; + IntegerType *NewTy = Type::getIntNTy(*Context, MR.Width); + Instruction &InsertPt = *Builder.GetInsertPoint(); + for (Instruction *I : make_range(Insts.rbegin(), Insts.rend())) { + if (LoadInst *LI = dyn_cast(I)) { + unsigned TBits = DL->getTypeSizeInBits(LI->getType()); + unsigned StOffset = computeStOffset(MR, TBits, *DL); + unsigned Align = MinAlign(StOffset, LI->getAlignment()); + // If we use a new insertion point, we have to recreate the NewPtr in + // the new place. 
+ if (NewInsertPt) { + Builder.SetInsertPoint(NewInsertPt); + RecursivelyDeleteTriviallyDeadInstructions(NewPtr); + NewPtr = + createNewPtr(LI->getPointerOperand(), StOffset, NewTy, Builder); + } + LoadInst *NewLI = cast(LI->clone()); + NewLI->setOperand(0, NewPtr); + NewLI->mutateType(NewTy); + NewLI->setAlignment(Align); + Builder.Insert(NewLI, "load.trunc"); + NewVal = NewLI; + // Update MemorySSA. + MemoryUseOrDef *OldMemAcc = + cast(MSSA->getMemoryAccess(LI)); + MemoryAccess *Def = OldMemAcc->getDefiningAccess(); + // Find a proper position to insert the new MemoryAccess. + MemoryAccess *Next = findNextMemoryAccess(*NewLI, *MSSA); + if (Next) + MSSAUpdater->createMemoryAccessBefore(NewLI, Def, + cast(Next)); + else + MSSAUpdater->createMemoryAccessInBB(NewLI, Def, NewLI->getParent(), + MemorySSA::End); + } else { + // shrink Add/Sub/And/Xor/Or in Insts. + BinaryOperator *BOP = cast(I); + ConstantInt *Cst = cast(BOP->getOperand(1)); + APInt NewAPInt = Cst->getValue().extractBits(MR.Width, MR.Shift); + ConstantInt *NewCst = + ConstantInt::get(*Context, NewAPInt.zextOrTrunc(MR.Width)); + NewVal = Builder.CreateBinOp(BOP->getOpcode(), NewVal, NewCst); + } + } + // Adjust the type to be consistent with the use of input trunc/and + // instructions. + IntegerType *UseType = cast(InsertPt.getType()); + if (DL->getTypeSizeInBits(UseType) != MR.Width) + NewVal = Builder.CreateZExt(NewVal, UseType, "trunc.zext"); + // Adjust the NewVal with ResShift. + if (ResShift) { + ConstantInt *NewCst = ConstantInt::get(UseType, ResShift); + NewVal = Builder.CreateShl(NewVal, NewCst, "shl"); + } + return NewVal; +} + +/// Determine whether \p Inst will be saved in the final shrinking and update +/// MultiUsesSeen and SavedInsts accordingly. If \p ReplaceAllUses is +/// true, it means we are going to replace all the uses of \p Inst with the +/// shrinked result, so it doesn't matter even if it has multiple uses. +void MemAccessShrinkingPass::addSavedInst(Instruction &Inst, + bool ReplaceAllUses) { + if (!MultiUsesSeen) { + // If the Inst has multiple uses and the current shrinking cannot replace + // all its uses, it will not regarded as a SavedInst. + if (ReplaceAllUses || Inst.hasOneUse()) + SavedInsts += (!isa(Inst) && !isa(Inst)); + else + MultiUsesSeen = true; + } +} + +/// Compare \p SavedInsts with instructions we are about to create and decide +/// whether it is beneficial to do the shrinking. +bool MemAccessShrinkingPass::isBeneficial( + unsigned ResShift, SmallVectorImpl &Insts) { + unsigned InstsToCreate = 0; + if (ResShift) + InstsToCreate++; + InstsToCreate += Insts.size(); + return SavedInsts >= InstsToCreate; +} + +/// When the input instruction \p IN is and(Val, Cst) or trunc, it indicates +/// only a portion of its input value has been used. We will walk through the +/// ud chain, try to find out a final load and find out the range of the load +/// really being used. That is the range we want to shrink the load to. +/// The general pattern we are looking for is: +/// and/trunc(sequence_of_shl_or_lshr(sequence_of_bops( +/// sequence_of_or_and_pair(load P), ... ))) +/// An example: +/// and(lshr(add(or(and(load P, -65536), MaskedVal), 0x1000000000000), 48), +/// 255) +/// The actual modified range of load is [48, 56). The value in the range is +/// incremented by 1. The sequence above will be transformed to: +/// zext(add(load_i8(P + 48), 1), 64) +bool MemAccessShrinkingPass::reduceLoadOpsWidth(Instruction &IN) { + // If the instruction is actually dead, skip it. 
+  if (IN.use_empty())
+    return false;
+
+  Type *Ty = IN.getType();
+  if (Ty->isVectorTy() || !Ty->isIntegerTy())
+    return false;
+
+  MultiUsesSeen = false;
+  SavedInsts = 0;
+
+  // Match and(Val, Cst) or trunc(Val).
+  Value *Val;
+  ConstantInt *AndCst = nullptr;
+  if (!match(&IN, m_Trunc(m_Value(Val))) &&
+      !match(&IN, m_And(m_Value(Val), m_ConstantInt(AndCst))))
+    return false;
+  addSavedInst(IN, true);
+
+  // Get the initial MR.
+  ModRange MR;
+  MR.Shift = 0;
+  unsigned ResShift = 0;
+  unsigned TBits = DL->getTypeSizeInBits(Val->getType());
+  if (AndCst) {
+    APInt AI = AndCst->getValue();
+    // Check that AI has consecutive one bits; those bits are the range to be
+    // used. If the number of trailing zeros is not zero, remember it in
+    // ResShift; the value after shrinking needs to be shifted back
+    // accordingly.
+    MR.Shift = AI.countTrailingZeros();
+    ResShift = MR.Shift;
+    if (!(AI.lshr(MR.Shift) + 1).isPowerOf2() || MR.Shift == TBits)
+      return false;
+    MR.Width = AI.getActiveBits() - MR.Shift;
+  } else {
+    MR.Width = DL->getTypeSizeInBits(Ty);
+  }
+
+  // Match a series of LShr or Shl. Adjust MR.Shift accordingly.
+  // Note we have to be careful because valid bits may be cleared by
+  // back-and-forth shifts. We use an all-ones Mask APInt to simulate the
+  // shifts and track which bits remain valid after the shifts.
+  Value *Opd;
+  bool isLShr;
+  unsigned NewShift = MR.Shift;
+  APInt Mask(TBits, -1);
+  ConstantInt *Cst;
+  // Record the series of LShr or Shl in ShiftRecs.
+  SmallVector<std::pair<bool, uint64_t>, 4> ShiftRecs;
+  while ((isLShr = match(Val, m_LShr(m_Value(Opd), m_ConstantInt(Cst)))) ||
+         match(Val, m_Shl(m_Value(Opd), m_ConstantInt(Cst)))) {
+    addSavedInst(*cast<Instruction>(Val));
+    NewShift = isLShr ? (NewShift + Cst->getZExtValue())
+                      : (NewShift - Cst->getZExtValue());
+    ShiftRecs.push_back({isLShr, Cst->getZExtValue()});
+    Val = Opd;
+  }
+  // Iterate over ShiftRecs in reverse order and simulate the shifts on Mask.
+  for (auto SR : make_range(ShiftRecs.rbegin(), ShiftRecs.rend())) {
+    if (SR.first)
+      Mask = Mask.lshr(SR.second);
+    else
+      Mask = Mask.shl(SR.second);
+  }
+  // If all the bits in MR are one in Mask, MR is still valid after the shifts.
+  if (!CompareAPIntWithModRange(Mask, MR, false))
+    return false;
+  MR.Shift = NewShift;
+
+  // Restrict MR to shapes we can handle.
+  if (MR.Shift + MR.Width > TBits)
+    return false;
+  if (MR.Width != 8 && MR.Width != 16 && MR.Width != 32)
+    return false;
+  if (MR.Shift % 8 != 0)
+    return false;
+
+  // Match other binary operators like Add/Sub/And/Xor/Or, or a pattern like
+  // And(Or(...And(Or(Val, Cst)))), and find the final load.
+  SmallVector<Instruction *, 8> Insts;
+  if (!matchInstsToShrink(Val, MR, Insts))
+    return false;
+
+  // The final load is expected to have been found here.
+  assert(!Insts.empty() && "Expect Insts to be not empty");
+  LoadInst *LI = dyn_cast<LoadInst>(Insts.back());
+  assert(LI && "Last elem in Insts should be a LoadInst");
+  if (LI->isVolatile() || !LI->isUnordered())
+    return false;
+
+  IntegerType *NewTy = Type::getIntNTy(*Context, MR.Width);
+  if (TLI->isNarrowingExpensive(EVT::getEVT(Ty), EVT::getEVT(NewTy)))
+    return false;
+
+  // Do a legality check to ensure the range of the shrunken load is not
+  // clobbered between *LI and IN.
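The all-ones-mask bookkeeping above is easier to see on concrete numbers. The following standalone C++ sketch is not part of the patch; it fixes the width to 64 bits, uses plain integers instead of APInt, and invents its own helper names to replay a shift sequence and test whether a bit range survives.

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Replay a sequence of shifts on an all-ones 64-bit mask. Shifts are given
// outermost-first, i.e. in the order the pass matches them while walking from
// the and/trunc towards the load; {true, N} is lshr by N, {false, N} shl by N.
// A bit that is still 1 afterwards was never shifted out and refilled with
// zeros, so a range made only of such bits still comes from the load.
static uint64_t
survivingBits(const std::vector<std::pair<bool, unsigned>> &Shifts) {
  uint64_t Mask = ~0ULL;
  // Replay innermost-first, matching how the pass iterates ShiftRecs in
  // reverse.
  for (auto It = Shifts.rbegin(); It != Shifts.rend(); ++It)
    Mask = It->first ? (Mask >> It->second) : (Mask << It->second);
  return Mask;
}

static bool rangeSurvives(uint64_t Mask, unsigned Shift, unsigned Width) {
  uint64_t Want = (Width == 64) ? ~0ULL : (((1ULL << Width) - 1) << Shift);
  return (Mask & Want) == Want;
}

int main() {
  // and(lshr(load, 48), 255): the used range is [0, 8) of the shifted value,
  // and those bits survive a single lshr by 48.
  assert(rangeSurvives(survivingBits({{true, 48}}), 0, 8));

  // lshr(shl(load, 8), 8): the lshr refills the top byte with zeros, so a
  // range that needs bits [56, 64) of the result must be rejected.
  assert(!rangeSurvives(survivingBits({{true, 8}, {false, 8}}), 56, 8));
  return 0;
}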
+  IRBuilder<> Builder(*Context);
+  Builder.SetInsertPoint(&IN);
+  unsigned StOffset = computeStOffset(MR, TBits, *DL);
+  Value *NewPtr =
+      createNewPtr(LI->getPointerOperand(), StOffset, NewTy, Builder);
+  Instruction *NewInsertPt = nullptr;
+  if (hasClobberBetween(*LI, IN, MemoryLocation(NewPtr, MR.Width / 8),
+                        NewInsertPt)) {
+    RecursivelyDeleteTriviallyDeadInstructions(NewPtr);
+    return false;
+  }
+  if (!isBeneficial(ResShift, Insts)) {
+    RecursivelyDeleteTriviallyDeadInstructions(NewPtr);
+    return false;
+  }
+
+  // Start shrinking the instructions collected in Insts.
+  Value *NewVal =
+      shrinkInsts(NewPtr, MR, Insts, NewInsertPt, ResShift, Builder);
+  DEBUG(dbgs() << "MemShrink: Replace" << IN << " with" << *NewVal << "\n");
+  IN.replaceAllUsesWith(NewVal);
+  markCandForDeletion(IN);
+  NumLoadShrinked++;
+  return true;
+}
+
+/// Check the instructions in CandidatesToErase. If they are dead, remove them
+/// recursively and clean up MemorySSA.
+bool MemAccessShrinkingPass::removeDeadInsts() {
+  SmallVector<Instruction *, 16> DeadInsts;
+  bool Changed = false;
+
+  for (Instruction *I : CandidatesToErase) {
+    if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLInfo))
+      continue;
+    assert(DeadInsts.empty() && "DeadInsts is empty at the beginning");
+    DeadInsts.push_back(I);
+
+    do {
+      I = DeadInsts.pop_back_val();
+
+      // Null out all of the instruction's operands to see if any operand
+      // becomes dead as we go.
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+        Value *OpV = I->getOperand(i);
+        I->setOperand(i, nullptr);
+
+        if (!OpV->use_empty())
+          continue;
+
+        // If the operand is an instruction that became dead as we nulled out
+        // the operand, and if it is 'trivially' dead, delete it in a future
+        // loop iteration.
+        if (Instruction *OpI = dyn_cast<Instruction>(OpV)) {
+          if (isInstructionTriviallyDead(OpI, TLInfo))
+            DeadInsts.push_back(OpI);
+          else
+            CandidatesToErase.insert(OpI);
+        }
+      }
+
+      MemoryAccess *MemAcc = MSSA->getMemoryAccess(I);
+      if (MemAcc)
+        MSSAUpdater->removeMemoryAccess(MemAcc);
+
+      CandidatesToErase.erase(I);
+      I->eraseFromParent();
+
+      Changed = true;
+    } while (!DeadInsts.empty());
+  }
+  return Changed;
+}
+
+/// Perform the memory access shrinking optimization for the given function.
+bool MemAccessShrinkingPass::runOnFunction(Function &Fn) {
+  if (skipFunction(Fn))
+    return false;
+
+  DL = &Fn.getParent()->getDataLayout();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+  MSSAWalker = MSSA->getWalker();
+  MSSAUpdater = make_unique<MemorySSAUpdater>(MSSA);
+  CandidatesToErase.clear();
+
+  Context = &Fn.getContext();
+
+  if (TM)
+    TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
+
+  // Iterate over the function until nothing changes.
+ MSSA->verifyMemorySSA(); + bool MadeChange = true; + bool EverMadeChange = false; + while (MadeChange) { + MadeChange = false; + for (BasicBlock *BB : post_order(&Fn)) { + auto CurInstIterator = BB->rbegin(); + while (CurInstIterator != BB->rend()) + MadeChange |= tryShrinkOnInst(*CurInstIterator++); + } + MadeChange |= removeDeadInsts(); + EverMadeChange |= MadeChange; + } + + return EverMadeChange; +} Index: lib/CodeGen/TargetPassConfig.cpp =================================================================== --- lib/CodeGen/TargetPassConfig.cpp +++ lib/CodeGen/TargetPassConfig.cpp @@ -68,6 +68,8 @@ cl::desc("Disable Machine LICM")); static cl::opt DisableMachineSink("disable-machine-sink", cl::Hidden, cl::desc("Disable Machine Sinking")); +static cl::opt DisableMemAccessShrinking("disable-memaccess-shrinking", + cl::Hidden, cl::desc("Disable MemAccessShrinking")); static cl::opt DisableLSR("disable-lsr", cl::Hidden, cl::desc("Disable Loop Strength Reduction Pass")); static cl::opt DisableConstantHoisting("disable-constant-hoisting", @@ -475,6 +477,9 @@ if (getOptLevel() != CodeGenOpt::None && !DisableConstantHoisting) addPass(createConstantHoistingPass()); + if (getOptLevel() != CodeGenOpt::None && !DisableMemAccessShrinking) + addPass(createMemAccessShrinkingPass(TM)); + if (getOptLevel() != CodeGenOpt::None && !DisablePartialLibcallInlining) addPass(createPartiallyInlineLibCallsPass()); Index: lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.h +++ lib/Target/AArch64/AArch64ISelLowering.h @@ -284,6 +284,8 @@ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool isNarrowingExpensive(EVT VT1, EVT VT2) const override; + /// Return true if the given shuffle mask can be codegen'd directly, or if it /// should be stack expanded. bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6718,6 +6718,10 @@ return SDValue(); } +bool AArch64TargetLowering::isNarrowingExpensive(EVT VT1, EVT VT2) const { + return false; +} + bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const { if (VT.getVectorNumElements() == 4 && Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -428,6 +428,7 @@ Sched::Preference getSchedulingPreference(SDNode *N) const override; + bool isNarrowingExpensive(EVT VT1, EVT VT2) const override; bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -6432,6 +6432,10 @@ return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } +bool ARMTargetLowering::isNarrowingExpensive(EVT VT1, EVT VT2) const { + return false; +} + /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -939,6 +939,9 @@ /// from i32 to i8 but not from i32 to i16. bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + /// Return true if it's expensive to narrow operations of type VT1 to VT2. + bool isNarrowingExpensive(EVT VT1, EVT VT2) const override; + /// Given an intrinsic, checks if on the target the intrinsic will need to map /// to a MemIntrinsicNode (touches memory). If this is the case, it returns /// true and stores the intrinsic information into the IntrinsicInfo that was Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -24338,6 +24338,10 @@ return !(VT1 == MVT::i32 && VT2 == MVT::i16); } +bool X86TargetLowering::isNarrowingExpensive(EVT VT1, EVT VT2) const { + return false; +} + /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values Index: test/CodeGen/AArch64/arm64-fast-isel-conversion.ll =================================================================== --- test/CodeGen/AArch64/arm64-fast-isel-conversion.ll +++ test/CodeGen/AArch64/arm64-fast-isel-conversion.ll @@ -9,12 +9,11 @@ ; CHECK: strh w1, [sp, #12] ; CHECK: str w2, [sp, #8] ; CHECK: str x3, [sp] -; CHECK: ldr x3, [sp] -; CHECK: mov x0, x3 +; CHECK: ldr w0, [sp] ; CHECK: str w0, [sp, #8] -; CHECK: ldr w0, [sp, #8] +; CHECK: ldrh w0, [sp, #8] ; CHECK: strh w0, [sp, #12] -; CHECK: ldrh w0, [sp, #12] +; CHECK: ldrb w0, [sp, #12] ; CHECK: strb w0, [sp, #15] ; CHECK: ldrb w0, [sp, #15] ; CHECK: add sp, sp, #16 @@ -402,10 +401,8 @@ define void @stack_trunc() nounwind { ; CHECK-LABEL: stack_trunc ; CHECK: sub sp, sp, #16 -; CHECK: ldr [[REG:x[0-9]+]], [sp] -; CHECK: mov x[[REG2:[0-9]+]], [[REG]] -; CHECK: and [[REG3:w[0-9]+]], w[[REG2]], #0xff -; CHECK: strb [[REG3]], [sp, #15] +; CHECK: ldrb [[REG:w[0-9]+]], [sp] +; CHECK: strb [[REG]], [sp, #15] ; CHECK: add sp, sp, #16 %a = alloca i8, align 1 %b = alloca i64, align 8 Index: test/CodeGen/AArch64/arm64-fast-isel-gv.ll =================================================================== --- test/CodeGen/AArch64/arm64-fast-isel-gv.ll +++ test/CodeGen/AArch64/arm64-fast-isel-gv.ll @@ -25,7 +25,7 @@ ; CHECK: add [[REG7:x[0-9]+]], [[REG6]], [[REG3]] ; CHECK: and [[REG8:x[0-9]+]], [[REG7]], #0xffff ; CHECK: str [[REG8]], {{\[}}[[REG1]]{{\]}} -; CHECK: ldr {{x[0-9]+}}, {{\[}}[[REG1]]{{\]}} +; CHECK: ldr {{w[0-9]+}}, {{\[}}[[REG1]]{{\]}} %0 = load i64, i64* @seed, align 8 %mul = mul nsw i64 %0, 1309 %add = add nsw i64 %mul, 13849 Index: test/CodeGen/ARM/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -12,13 +12,9 @@ ; ; BE-LABEL: i24_or: ; BE: @ BB#0: -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: ldrb r2, [r0, #2] -; BE-NEXT: orr r1, r2, r1, lsl #8 +; BE-NEXT: ldrh r1, [r0, #1] ; BE-NEXT: orr r1, r1, #384 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] +; BE-NEXT: strh r1, [r0, #1] ; BE-NEXT: mov pc, lr %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 
384 @@ -29,36 +25,23 @@ define void @i24_and_or(i24* %a) { ; LE-LABEL: i24_and_or: ; LE: @ BB#0: -; LE-NEXT: ldrb r1, [r0, #2] -; LE-NEXT: ldrh r2, [r0] -; LE-NEXT: orr r1, r2, r1, lsl #16 -; LE-NEXT: ldr r2, .LCPI1_0 +; LE-NEXT: ldrh r1, [r0] +; LE-NEXT: mov r2, #16256 +; LE-NEXT: orr r2, r2, #49152 ; LE-NEXT: orr r1, r1, #384 ; LE-NEXT: and r1, r1, r2 ; LE-NEXT: strh r1, [r0] -; LE-NEXT: lsr r1, r1, #16 -; LE-NEXT: strb r1, [r0, #2] ; LE-NEXT: mov pc, lr -; LE-NEXT: .p2align 2 -; LE-NEXT: @ BB#1: -; LE-NEXT: .LCPI1_0: -; LE-NEXT: .long 16777088 @ 0xffff80 ; ; BE-LABEL: i24_and_or: ; BE: @ BB#0: -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: mov r2, #384 -; BE-NEXT: orr r1, r2, r1, lsl #8 -; BE-NEXT: ldr r2, .LCPI1_0 +; BE-NEXT: ldrh r1, [r0, #1] +; BE-NEXT: mov r2, #16256 +; BE-NEXT: orr r2, r2, #49152 +; BE-NEXT: orr r1, r1, #384 ; BE-NEXT: and r1, r1, r2 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] -; BE-NEXT: mov pc, lr -; BE-NEXT: .p2align 2 -; BE-NEXT: @ BB#1: -; BE-NEXT: .LCPI1_0: -; BE-NEXT: .long 16777088 @ 0xffff80 +; BE-NEXT: strh r1, [r0, #1] +; BE-NEXT: mov pc, lr %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 %d = or i24 %c, 384 @@ -69,37 +52,19 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; LE-LABEL: i24_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldrb r2, [r0, #2] -; LE-NEXT: ldrh r3, [r0] -; LE-NEXT: orr r2, r3, r2, lsl #16 -; LE-NEXT: ldr r3, .LCPI2_0 -; LE-NEXT: and r2, r2, r3 -; LE-NEXT: lsr r3, r2, #16 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: strb r3, [r0, #2] -; LE-NEXT: strh r1, [r0] +; LE-NEXT: ldrb r2, [r0, #1] +; LE-NEXT: and r2, r2, #223 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: strb r1, [r0, #1] ; LE-NEXT: mov pc, lr -; LE-NEXT: .p2align 2 -; LE-NEXT: @ BB#1: -; LE-NEXT: .LCPI2_0: -; LE-NEXT: .long 16769023 @ 0xffdfff ; ; BE-LABEL: i24_insert_bit: ; BE: @ BB#0: -; BE-NEXT: ldrh r2, [r0] -; BE-NEXT: ldrb r3, [r0, #2] -; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: ldr r3, .LCPI2_0 -; BE-NEXT: and r2, r2, r3 -; BE-NEXT: orr r1, r2, r1, lsl #13 -; BE-NEXT: strb r2, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] -; BE-NEXT: mov pc, lr -; BE-NEXT: .p2align 2 -; BE-NEXT: @ BB#1: -; BE-NEXT: .LCPI2_0: -; BE-NEXT: .long 16769023 @ 0xffdfff +; BE-NEXT: ldrb r2, [r0, #1] +; BE-NEXT: and r2, r2, #223 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: strb r1, [r0, #1] +; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i24 %b = load i24, i24* %a, align 1 %extbit.shl = shl nuw nsw i24 %extbit, 13 @@ -119,19 +84,9 @@ ; ; BE-LABEL: i56_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: ldrh r2, [r1, #4]! -; BE-NEXT: ldrb r3, [r1, #2] -; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strb r2, [r1, #2] -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %aa = load i56, i56* %a %b = or i56 %aa, 384 @@ -150,19 +105,10 @@ ; ; BE-LABEL: i56_and_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: mov r3, #128 -; BE-NEXT: ldrh r2, [r1, #4]! 
-; BE-NEXT: strb r3, [r1, #2] -; BE-NEXT: lsl r2, r2, #8 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: bic r1, r1, #127 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %b = load i56, i56* %a, align 1 @@ -175,31 +121,18 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { ; LE-LABEL: i56_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldr r2, [r0] -; LE-NEXT: bic r2, r2, #8192 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: str r1, [r0] +; LE-NEXT: ldr r2, [r0, #1] +; LE-NEXT: bic r2, r2, #32 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: str r1, [r0, #1] ; LE-NEXT: mov pc, lr ; ; BE-LABEL: i56_insert_bit: ; BE: @ BB#0: -; BE-NEXT: .save {r11, lr} -; BE-NEXT: push {r11, lr} -; BE-NEXT: mov r2, r0 -; BE-NEXT: ldr lr, [r0] -; BE-NEXT: ldrh r12, [r2, #4]! -; BE-NEXT: ldrb r3, [r2, #2] -; BE-NEXT: orr r12, r3, r12, lsl #8 -; BE-NEXT: orr r3, r12, lr, lsl #24 -; BE-NEXT: bic r3, r3, #8192 -; BE-NEXT: orr r1, r3, r1, lsl #13 -; BE-NEXT: strb r3, [r2, #2] -; BE-NEXT: lsr r3, r1, #8 -; BE-NEXT: strh r3, [r2] -; BE-NEXT: bic r2, lr, #255 -; BE-NEXT: orr r1, r2, r1, lsr #24 -; BE-NEXT: str r1, [r0] -; BE-NEXT: pop {r11, lr} +; BE-NEXT: ldr r2, [r0, #2] +; BE-NEXT: bic r2, r2, #32 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: str r1, [r0, #2] ; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 Index: test/CodeGen/ARM/load-shrink.ll =================================================================== --- test/CodeGen/ARM/load-shrink.ll +++ test/CodeGen/ARM/load-shrink.ll @@ -0,0 +1,109 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s +; Check bitfield load is shrinked properly in cases below. + +; The bitfield store can be shrinked from i64 to i16. +; CHECK-LABEL: @load_and( +; CHECK: %cast = bitcast i64* %ptr to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 8 +; CHECK: %trunc.zext = zext i16 %load.trunc to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext, 1 + +define i1 @load_and(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.clear = and i64 %bf.load, 65535 + %cmp = icmp eq i64 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. +; CHECK-LABEL: @load_trunc( +; CHECK: %cast = bitcast i64* %ptr to i8* +; CHECK: %load.trunc = load i8, i8* %cast, align 8 +; CHECK: %cmp = icmp eq i8 %load.trunc, 1 + +define i1 @load_trunc(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.clear = trunc i64 %bf.load to i8 + %cmp = icmp eq i8 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. +; CHECK-LABEL: @load_and_shr( +; CHECK: %cast = bitcast i64* %ptr to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 2 +; CHECK: %trunc.zext = zext i8 %load.trunc to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext, 1 + +define i1 @load_and_shr(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.lshr = lshr i64 %bf.load, 48 + %bf.clear = and i64 %bf.lshr, 255 + %cmp = icmp eq i64 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. 
+; CHECK-LABEL: @load_and_shr_add( +; CHECK: %cast = bitcast i64* %ptr to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 2 +; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1 +; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext, 1 + +define i1 @load_and_shr_add(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %add = add i64 %bf.load, 281474976710656 + %bf.lshr = lshr i64 %add, 48 + %bf.clear = and i64 %bf.lshr, 255 + %cmp = icmp eq i64 %bf.clear, 1 + ret i1 %cmp +} + +; The bitfield store can be shrinked from i64 to i8. +; CHECK-LABEL: @load_ops( +; CHECK: %cast = bitcast i64* %ptr to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 2 +; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1 +; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext, 1 + +define i1 @load_ops(i64* %ptr, i64 %value) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.value = and i64 %value, 65535 + %bf.clear1 = and i64 %bf.load, -65536 + %bf.set = or i64 %bf.value, %bf.clear1 + %add = add i64 %bf.set, 281474976710656 + %bf.lshr = lshr i64 %add, 48 + %bf.clear = and i64 %bf.lshr, 255 + %cmp = icmp eq i64 %bf.clear, 1 + ret i1 %cmp +} + +; It doesn't worth to do the shrink because %bf.load has multiple uses +; and the shrink here doesn't save instructions. +; CHECK-LABEL: @load_trunc_multiuses +; CHECK: %bf.load = load i64, i64* %ptr, align 8 +; CHECK: %bf.trunc = trunc i64 %bf.load to i16 +; CHECK: %cmp1 = icmp eq i16 %bf.trunc, 3 +; CHECK: %cmp2 = icmp ult i64 %bf.load, 1500000 +; CHECK: %cmp = and i1 %cmp1, %cmp2 + +define i1 @load_trunc_multiuses(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.trunc = trunc i64 %bf.load to i16 + %cmp1 = icmp eq i16 %bf.trunc, 3 + %cmp2 = icmp ult i64 %bf.load, 1500000 + %cmp = and i1 %cmp1, %cmp2 + ret i1 %cmp +} + Index: test/CodeGen/ARM/store-shrink.ll =================================================================== --- test/CodeGen/ARM/store-shrink.ll +++ test/CodeGen/ARM/store-shrink.ll @@ -0,0 +1,364 @@ +; RUN: opt < %s -mtriple=arm-eabi -memaccessshrink -S | FileCheck %s +; Check bitfield store is shrinked properly in cases below. + +; class A1 { +; unsigned long f1:8; +; unsigned long f2:3; +; } a1; +; a1.f1 = n; +; +; The bitfield store can be shrinked from i16 to i8. +; CHECK-LABEL: @test1( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %t0 = trunc i64 %conv to i16 +; CHECK: %bf.value = and i16 %t0, 255 +; CHECK: %trunc = trunc i16 %bf.value to i8 +; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8 + +%class.A1 = type { i16, [6 x i8] } +@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8 + +define void @test1(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %t0 = trunc i64 %conv to i16 + %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + %bf.value = and i16 %t0, 255 + %bf.clear = and i16 %bf.load, -256 + %bf.set = or i16 %bf.clear, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + ret void +} + +; class A2 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a2; +; a2.f1 = n; +; The bitfield store can be shrinked from i32 to i16. 
+; CHECK-LABEL: @test2( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8 + +%class.A2 = type { i24, [4 x i8] } +@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8 + +define void @test2(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.clear = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear, %bf.value + store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8 + ret void +} + +; class A3 { +; unsigned long f1:32; +; unsigned long f2:3; +; } a3; +; a3.f1 = n; +; The bitfield store can be shrinked from i64 to i32. +; CHECK-LABEL: @test3( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %bf.value = and i64 %conv, 4294967295 +; CHECK: %trunc = trunc i64 %bf.value to i32 +; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8 + +%class.A3 = type { i40 } +@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8 + +define void @test3(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8 + %bf.value = and i64 %conv, 4294967295 + %bf.clear = and i64 %bf.load, -4294967296 + %bf.set = or i64 %bf.clear, %bf.value + store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8 + ret void +} + +; class A4 { +; unsigned long f1:13; +; unsigned long f2:3; +; } a4; +; a4.f1 = n; +; The bitfield store cannot be shrinked because the field is not 8/16/32 bits. +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. 
+; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. +; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. 
+; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. +; CHECK-LABEL: @i24_insert_bit( +; CHECK: %extbit = zext i1 %bit to i24 +; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13 +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i24 %extbit.shl, 8 +; CHECK: %trunc = trunc i24 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i24 + %b = load i24, i24* %a, align 1 + %extbit.shl = shl nuw nsw i24 %extbit, 13 + %c = and i24 %b, -8193 + %d = or i24 %c, %extbit.shl + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or( +; CHECK: %cast = bitcast i56* %a to i32* +; CHECK: %load.trunc = load i32, i32* %cast, align 1 +; CHECK: %or.trunc = or i32 %load.trunc, 384 +; CHECK: store i32 %or.trunc, i32* %cast, align 1 +; +define void @i56_or(i56* %a) { + %aa = load i56, i56* %a, align 1 + %b = or i56 %aa, 384 + store i56 %b, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_and_or( +; CHECK: %cast = bitcast i56* %a to i32* +; CHECK: %load.trunc = load i32, i32* %cast, align 1 +; CHECK: %and.trunc = and i32 %load.trunc, -128 +; CHECK: %or.trunc = or i32 %and.trunc, 384 +; CHECK: store i32 %or.trunc, i32* %cast, align 1 +; +define void @i56_and_or(i56* %a) { + %b = load i56, i56* %a, align 1 + %c = and i56 %b, -128 + %d = or i56 %c, 384 + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store to i8. 
+; CHECK-LABEL: @i56_insert_bit( +; CHECK: %extbit = zext i1 %bit to i56 +; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13 +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %load.trunc = load i32, i32* %cast1, align 1 +; CHECK: %lshr = lshr i56 %extbit.shl, 8 +; CHECK: %trunc = trunc i56 %lshr to i32 +; CHECK: %and.trunc = and i32 %load.trunc, -33 +; CHECK: %or.trunc = or i32 %and.trunc, %trunc +; CHECK: store i32 %or.trunc, i32* %cast1, align 1 +; +define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i56 + %b = load i56, i56* %a, align 1 + %extbit.shl = shl nuw nsw i56 %extbit, 13 + %c = and i56 %b, -8193 + %d = or i56 %c, %extbit.shl + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %load.trunc = load i32, i32* %cast1, align 2 +; CHECK: %or.trunc = or i32 %load.trunc, 272 +; CHECK: store i32 %or.trunc, i32* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: test/CodeGen/X86/2008-09-11-CoalescerBug2.ll =================================================================== --- test/CodeGen/X86/2008-09-11-CoalescerBug2.ll +++ test/CodeGen/X86/2008-09-11-CoalescerBug2.ll @@ -13,9 +13,9 @@ ; SOURCE-SCHED: xorl ; SOURCE-SCHED: cmpl ; SOURCE-SCHED: setg -; SOURCE-SCHED: movb ; SOURCE-SCHED: xorl ; SOURCE-SCHED: subl +; SOURCE-SCHED: movb ; SOURCE-SCHED: testb ; SOURCE-SCHED: jne %0 = load i32, i32* @g_5, align 4 ; [#uses=1] Index: test/CodeGen/X86/constant-combines.ll =================================================================== --- test/CodeGen/X86/constant-combines.ll +++ test/CodeGen/X86/constant-combines.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | FileCheck %s +; RUN: llc -disable-memaccess-shrinking < %s | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" Index: test/CodeGen/X86/i16lshr8pat.ll =================================================================== --- test/CodeGen/X86/i16lshr8pat.ll +++ test/CodeGen/X86/i16lshr8pat.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=x86 -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s +; RUN: llc -march=x86 -disable-memaccess-shrinking -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386-unknown-linux-gnu" Index: test/CodeGen/X86/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -4,13 +4,7 @@ define void @i24_or(i24* %a) { ; CHECK-LABEL: i24_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180 ; CHECK-NEXT: retq %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 @@ -22,14 +16,9 @@ ; CHECK-LABEL: i24_and_or: ; CHECK: # BB#0: ; CHECK-NEXT: movzwl (%rdi), %eax -; 
CHECK-NEXT: movzbl 2(%rdi), %ecx
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: orl $384, %ecx # imm = 0x180
-; CHECK-NEXT: andl $16777088, %ecx # imm = 0xFFFF80
-; CHECK-NEXT: movw %cx, (%rdi)
-; CHECK-NEXT: shrl $16, %ecx
-; CHECK-NEXT: movb %cl, 2(%rdi)
+; CHECK-NEXT: orl $384, %eax # imm = 0x180
+; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80
+; CHECK-NEXT: movw %ax, (%rdi)
; CHECK-NEXT: retq
%b = load i24, i24* %a, align 1
%c = and i24 %b, -128
@@ -41,17 +30,11 @@
define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
; CHECK-LABEL: i24_insert_bit:
; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: movzwl (%rdi), %ecx
-; CHECK-NEXT: movzbl 2(%rdi), %edx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: shll $13, %eax
-; CHECK-NEXT: andl $16769023, %edx # imm = 0xFFDFFF
-; CHECK-NEXT: orl %edx, %eax
-; CHECK-NEXT: shrl $16, %edx
-; CHECK-NEXT: movb %dl, 2(%rdi)
-; CHECK-NEXT: movw %ax, (%rdi)
+; CHECK-NEXT: movb 1(%rdi), %al
+; CHECK-NEXT: shlb $5, %sil
+; CHECK-NEXT: andb $-33, %al
+; CHECK-NEXT: orb %sil, %al
+; CHECK-NEXT: movb %al, 1(%rdi)
; CHECK-NEXT: retq
%extbit = zext i1 %bit to i24
%b = load i24, i24* %a, align 1
@@ -65,19 +48,7 @@
define void @i56_or(i56* %a) {
; CHECK-LABEL: i56_or:
; CHECK: # BB#0:
-; CHECK-NEXT: movzwl 4(%rdi), %eax
-; CHECK-NEXT: movzbl 6(%rdi), %ecx
-; CHECK-NEXT: movl (%rdi), %edx
-; CHECK-NEXT: movb %cl, 6(%rdi)
-; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: shlq $32, %rcx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: orq $384, %rdx # imm = 0x180
-; CHECK-NEXT: movl %edx, (%rdi)
-; CHECK-NEXT: shrq $32, %rdx
-; CHECK-NEXT: movw %dx, 4(%rdi)
+; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180
; CHECK-NEXT: retq
%aa = load i56, i56* %a, align 1
%b = or i56 %aa, 384
@@ -88,22 +59,10 @@
define void @i56_and_or(i56* %a) {
; CHECK-LABEL: i56_and_or:
; CHECK: # BB#0:
-; CHECK-NEXT: movzwl 4(%rdi), %eax
-; CHECK-NEXT: movzbl 6(%rdi), %ecx
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: shlq $32, %rcx
-; CHECK-NEXT: movl (%rdi), %eax
-; CHECK-NEXT: orq %rcx, %rax
-; CHECK-NEXT: orq $384, %rax # imm = 0x180
-; CHECK-NEXT: movabsq $72057594037927808, %rcx # imm = 0xFFFFFFFFFFFF80
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movl %ecx, (%rdi)
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movw %ax, 4(%rdi)
-; CHECK-NEXT: shrq $48, %rcx
-; CHECK-NEXT: movb %cl, 6(%rdi)
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: orl $384, %eax # imm = 0x180
+; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80
+; CHECK-NEXT: movw %ax, (%rdi)
; CHECK-NEXT: retq
%b = load i56, i56* %a, align 1
%c = and i56 %b, -128
@@ -115,23 +74,11 @@
define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
; CHECK-LABEL: i56_insert_bit:
; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: movzwl 4(%rdi), %ecx
-; CHECK-NEXT: movzbl 6(%rdi), %edx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: shlq $32, %rdx
-; CHECK-NEXT: movl (%rdi), %ecx
-; CHECK-NEXT: orq %rdx, %rcx
-; CHECK-NEXT: shlq $13, %rax
-; CHECK-NEXT: movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF
-; CHECK-NEXT: andq %rcx, %rdx
-; CHECK-NEXT: orq %rdx, %rax
-; CHECK-NEXT: movl %eax, (%rdi)
-; CHECK-NEXT: shrq $48, %rdx
-; CHECK-NEXT: movb %dl, 6(%rdi)
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movw %ax, 4(%rdi)
+; CHECK-NEXT: movb 1(%rdi), %al
+; CHECK-NEXT: shlb $5, %sil
+; CHECK-NEXT: andb $-33, %al
+; CHECK-NEXT: orb %sil, %al
+; CHECK-NEXT: movb %al, 1(%rdi)
; CHECK-NEXT: retq
%extbit = zext i1 %bit to i56
%b = load i56, i56* %a, align 1
Index: test/CodeGen/X86/load-shrink.ll
===================================================================
--- test/CodeGen/X86/load-shrink.ll
+++ test/CodeGen/X86/load-shrink.ll
@@ -0,0 +1,109 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s
+; Check that bitfield loads are shrunk properly in the cases below.
+
+; The bitfield load can be shrunk from i64 to i16.
+; CHECK-LABEL: @load_and(
+; CHECK: %cast = bitcast i64* %ptr to i16*
+; CHECK: %load.trunc = load i16, i16* %cast, align 8
+; CHECK: %trunc.zext = zext i16 %load.trunc to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and(i64* %ptr) {
+entry:
+ %bf.load = load i64, i64* %ptr, align 8
+ %bf.clear = and i64 %bf.load, 65535
+ %cmp = icmp eq i64 %bf.clear, 1
+ ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_trunc(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %load.trunc = load i8, i8* %cast, align 8
+; CHECK: %cmp = icmp eq i8 %load.trunc, 1
+
+define i1 @load_trunc(i64* %ptr) {
+entry:
+ %bf.load = load i64, i64* %ptr, align 8
+ %bf.clear = trunc i64 %bf.load to i8
+ %cmp = icmp eq i8 %bf.clear, 1
+ ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_and_shr(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %trunc.zext = zext i8 %load.trunc to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr(i64* %ptr) {
+entry:
+ %bf.load = load i64, i64* %ptr, align 8
+ %bf.lshr = lshr i64 %bf.load, 48
+ %bf.clear = and i64 %bf.lshr, 255
+ %cmp = icmp eq i64 %bf.clear, 1
+ ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_and_shr_add(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_and_shr_add(i64* %ptr) {
+entry:
+ %bf.load = load i64, i64* %ptr, align 8
+ %add = add i64 %bf.load, 281474976710656
+ %bf.lshr = lshr i64 %add, 48
+ %bf.clear = and i64 %bf.lshr, 255
+ %cmp = icmp eq i64 %bf.clear, 1
+ ret i1 %cmp
+}
+
+; The bitfield load can be shrunk from i64 to i8.
+; CHECK-LABEL: @load_ops(
+; CHECK: %cast = bitcast i64* %ptr to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 6
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 2
+; CHECK: %[[ADD:.*]] = add i8 %load.trunc, 1
+; CHECK: %trunc.zext = zext i8 %[[ADD]] to i64
+; CHECK: %cmp = icmp eq i64 %trunc.zext, 1
+
+define i1 @load_ops(i64* %ptr, i64 %value) {
+entry:
+ %bf.load = load i64, i64* %ptr, align 8
+ %bf.value = and i64 %value, 65535
+ %bf.clear1 = and i64 %bf.load, -65536
+ %bf.set = or i64 %bf.value, %bf.clear1
+ %add = add i64 %bf.set, 281474976710656
+ %bf.lshr = lshr i64 %add, 48
+ %bf.clear = and i64 %bf.lshr, 255
+ %cmp = icmp eq i64 %bf.clear, 1
+ ret i1 %cmp
+}
+
+; The shrink is not worthwhile here because %bf.load has multiple uses
+; and the shrink doesn't save any instructions.
+; CHECK-LABEL: @load_trunc_multiuses +; CHECK: %bf.load = load i64, i64* %ptr, align 8 +; CHECK: %bf.trunc = trunc i64 %bf.load to i16 +; CHECK: %cmp1 = icmp eq i16 %bf.trunc, 3 +; CHECK: %cmp2 = icmp ult i64 %bf.load, 1500000 +; CHECK: %cmp = and i1 %cmp1, %cmp2 + +define i1 @load_trunc_multiuses(i64* %ptr) { +entry: + %bf.load = load i64, i64* %ptr, align 8 + %bf.trunc = trunc i64 %bf.load to i16 + %cmp1 = icmp eq i16 %bf.trunc, 3 + %cmp2 = icmp ult i64 %bf.load, 1500000 + %cmp = and i1 %cmp1, %cmp2 + ret i1 %cmp +} + Index: test/CodeGen/X86/load-slice.ll =================================================================== --- test/CodeGen/X86/load-slice.ll +++ test/CodeGen/X86/load-slice.ll @@ -110,20 +110,23 @@ ret i32 %res } -; Check that we do not optimize overlapping slices. ; -; The 64-bits should NOT have been split in as slices are overlapping. +; The slices are overlapping, but it can still be split. ; First slice uses bytes numbered 0 to 3. ; Second slice uses bytes numbered 6 and 7. ; Third slice uses bytes numbered 4 to 7. ; ; STRESS-LABEL: t3: -; STRESS: shrq $48 -; STRESS: shrq $32 +; STRESS: movzwl 6(%rdi,%rsi,8), %eax +; STRESS-NEXT: addl (%rdi,%rsi,8), %eax +; STRESS-NEXT: addl 4(%rdi,%rsi,8), %eax ; ; REGULAR-LABEL: t3: -; REGULAR: shrq $48 -; REGULAR: shrq $32 +; REGULAR: movq (%rdi,%rsi,8), %rcx +; REGULAR-NEXT: movq %rcx, %rax +; REGULAR-NEXT: shrq $48, %rax +; REGULAR-NEXT: addl %ecx, %eax +; REGULAR-NEXT: addl 4(%rdi,%rsi,8), %eax define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) { %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start %bitcast = bitcast %class.Complex* %arrayidx to i64* Index: test/CodeGen/X86/lsr-loop-exit-cond.ll =================================================================== --- test/CodeGen/X86/lsr-loop-exit-cond.ll +++ test/CodeGen/X86/lsr-loop-exit-cond.ll @@ -2,14 +2,16 @@ ; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s ; CHECK-LABEL: t: +; CHECK: [[LABEL:LBB.*]]: ; CHECK: movl (%r9,%rax,4), %e{{..}} -; CHECK-NEXT: testq -; CHECK-NEXT: jne +; CHECK: testq +; CHECK: jne [[LABEL]] ; ATOM-LABEL: t: +; ATOM: [[LABEL:LBB.*]]: ; ATOM: movl (%r9,%r{{.+}},4), %e{{..}} -; ATOM-NEXT: testq -; ATOM-NEXT: jne +; ATOM: testq +; ATOM: jne [[LABEL]] @Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5] @Te1 = external global [256 x i32] ; <[256 x i32]*> [#uses=4] Index: test/CodeGen/X86/mem-access-shrink.ll =================================================================== --- test/CodeGen/X86/mem-access-shrink.ll +++ test/CodeGen/X86/mem-access-shrink.ll @@ -0,0 +1,216 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s +; Check the memory accesses in the test are shrinked as expected. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%class.B = type { i56, [4 x i16], i64 } +@i = local_unnamed_addr global i64 0, align 8 + +define zeroext i1 @foo(%class.B* nocapture %this, i8* %p1, i64 %p2) local_unnamed_addr align 2 { +; CHECK-LABEL: @foo( +entry: +; CHECK: entry: +; CHECK: %t0 = bitcast %class.B* %this to i64* +; CHECK: %cast36 = bitcast i64* %t0 to i16* +; CHECK: %load.trunc37 = load i16, i16* %cast36, align 8 +; CHECK: %trunc.zext38 = zext i16 %load.trunc37 to i64 +; CHECK: %cmp = icmp eq i64 %trunc.zext38, 1 + + %t0 = bitcast %class.B* %this to i64* + %bf.load = load i64, i64* %t0, align 8 + %bf.clear = and i64 %bf.load, 65535 + %cmp = icmp eq i64 %bf.clear, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry +; CHECK: if.end: +; CHECK: %cast35 = bitcast i64* %t0 to i16* +; CHECK: %load.trunc = load i16, i16* %cast35, align 8 +; CHECK: %[[ADD1:.*]] = add i16 %load.trunc, -1 +; CHECK: %trunc.zext = zext i16 %[[ADD1]] to i64 +; CHECK: %cast33 = bitcast i64* %t0 to i16* +; CHECK: %trunc34 = trunc i64 %trunc.zext to i16 +; CHECK: store i16 %trunc34, i16* %cast33, align 8 + + %dec = add i64 %bf.load, 65535 + %bf.value = and i64 %dec, 65535 + %bf.clear5 = and i64 %bf.load, -65536 + %bf.set = or i64 %bf.value, %bf.clear5 + store i64 %bf.set, i64* %t0, align 8 + %t1 = ptrtoint i8* %p1 to i64 + %t2 = load i64, i64* @i, align 8 + %cmp.i = icmp ult i64 %t2, 3 + br i1 %cmp.i, label %if.then.i, label %if.else.i + +if.then.i: ; preds = %if.end + %and.i = lshr i64 %t1, 3 + %div.i = and i64 %and.i, 1023 + br label %_ZNK1B5m_fn3EPv.exit + +if.else.i: ; preds = %if.end + %first_page_.i = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 2 + %t3 = load i64, i64* %first_page_.i, align 8 + %mul.i = shl i64 %t3, 13 + %sub.i = sub i64 %t1, %mul.i + %div2.i = lshr i64 %sub.i, 2 + br label %_ZNK1B5m_fn3EPv.exit + +_ZNK1B5m_fn3EPv.exit: ; preds = %if.then.i, %if.else.i +; CHECK: _ZNK1B5m_fn3EPv.exit: +; CHECK: %cast76 = bitcast i64* %t0 to i8* +; CHECK: %uglygep77 = getelementptr i8, i8* %cast76, i32 6 +; CHECK: %load.trunc78 = load i8, i8* %uglygep77, align 2 +; CHECK: %trunc.zext79 = zext i8 %load.trunc78 to i64 +; CHECK: %cmp8 = icmp eq i64 %trunc.zext79, 4 + + %j.0.i = phi i64 [ %div.i, %if.then.i ], [ %div2.i, %if.else.i ] + %conv.i = trunc i64 %j.0.i to i16 + %bf.lshr = lshr i64 %bf.load, 48 + %bf.clear7 = and i64 %bf.lshr, 255 + %cmp8 = icmp eq i64 %bf.clear7, 4 + br i1 %cmp8, label %if.else, label %if.then9 + +if.then9: ; preds = %_ZNK1B5m_fn3EPv.exit +; CHECK: if.then9: +; CHECK: %arrayidx = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 1, i64 %trunc.zext79 +; CHECK: store i16 %conv.i, i16* %arrayidx, align 2 +; CHECK: %cast25 = bitcast i64* %t0 to i8* +; CHECK: %uglygep26 = getelementptr i8, i8* %cast25, i32 6 +; CHECK: %cast71 = bitcast i64* %t0 to i8* +; CHECK: %uglygep72 = getelementptr i8, i8* %cast71, i32 6 +; CHECK: %load.trunc73 = load i8, i8* %uglygep72, align 2 +; CHECK: %[[ADD2:.*]] = add i8 %load.trunc73, 1 +; CHECK: %[[AND1:.*]] = and i8 %[[ADD2]], -1 +; CHECK: store i8 %[[AND1]], i8* %uglygep26, align 2 + + %arrayidx = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 1, i64 %bf.clear7 + store i16 %conv.i, i16* %arrayidx, align 2 + %inc79 = add i64 %bf.set, 281474976710656 + %bf.shl = and i64 %inc79, 71776119061217280 + %bf.clear18 = and i64 %bf.set, -71776119061217281 + %bf.set19 = or i64 %bf.shl, %bf.clear18 + store i64 %bf.set19, i64* %t0, align 8 + br label %return + +if.else: ; preds = 
%_ZNK1B5m_fn3EPv.exit +; CHECK: if.else: +; CHECK: %cast66 = bitcast i64* %t0 to i8* +; CHECK: %uglygep67 = getelementptr i8, i8* %cast66, i32 4 +; CHECK: %cast68 = bitcast i8* %uglygep67 to i16* +; CHECK: %load.trunc69 = load i16, i16* %cast68, align 4 +; CHECK: %trunc.zext70 = zext i16 %load.trunc69 to i64 +; CHECK: %cmp24 = icmp eq i64 %trunc.zext70, 1 + + %bf.lshr21 = lshr i64 %bf.load, 32 + %bf.clear22 = and i64 %bf.lshr21, 65535 + %cmp24 = icmp eq i64 %bf.clear22, 1 + br i1 %cmp24, label %if.else55, label %land.lhs.true + +land.lhs.true: ; preds = %if.else +; CHECK: land.lhs.true: +; CHECK: %cast61 = bitcast i64* %t0 to i8* +; CHECK: %uglygep62 = getelementptr i8, i8* %cast61, i32 2 +; CHECK: %cast63 = bitcast i8* %uglygep62 to i16* +; CHECK: %load.trunc64 = load i16, i16* %cast63, align 2 +; CHECK: %trunc.zext65 = zext i16 %load.trunc64 to i64 +; CHECK: %cmp28 = icmp eq i64 %trunc.zext65, %sub + + %bf.lshr26 = lshr i64 %bf.load, 16 + %bf.clear27 = and i64 %bf.lshr26, 65535 + %div = lshr i64 %p2, 1 + %sub = add nsw i64 %div, -1 + %cmp28 = icmp eq i64 %bf.clear27, %sub + br i1 %cmp28, label %if.else55, label %if.then29 + +if.then29: ; preds = %land.lhs.true + %cmp30 = icmp ult i64 %p2, 3 + br i1 %cmp30, label %if.then31, label %if.else35 + +if.then31: ; preds = %if.then29 +; CHECK: if.then31: +; CHECK: %mul = shl nuw nsw i64 %trunc.zext70, 3 + + %and = and i64 %t1, -8192 + %mul = shl nuw nsw i64 %bf.clear22, 3 + %add = add i64 %mul, %and + br label %if.end41 + +if.else35: ; preds = %if.then29 +; CHECK: if.else35: +; CHECK: %cast56 = bitcast i64* %t0 to i8* +; CHECK: %uglygep57 = getelementptr i8, i8* %cast56, i32 4 +; CHECK: %cast58 = bitcast i8* %uglygep57 to i16* +; CHECK: %load.trunc59 = load i16, i16* %cast58, align 4 +; CHECK: %trunc.zext60 = zext i16 %load.trunc59 to i64 +; CHECK: %shl = shl i64 %trunc.zext60, 6 + + %first_page_.i80 = getelementptr inbounds %class.B, %class.B* %this, i64 0, i32 2 + %t4 = load i64, i64* %first_page_.i80, align 8 + %mul.i81 = shl i64 %t4, 13 + %conv.i82 = shl nuw nsw i64 %bf.lshr21, 6 + %mul3.i = and i64 %conv.i82, 4194240 + %add.i = add i64 %mul.i81, %mul3.i + br label %if.end41 + +if.end41: ; preds = %if.else35, %if.then31 +; CHECK: if.end41: +; CHECK: %cast8 = bitcast i64* %t0 to i8* +; CHECK: %uglygep9 = getelementptr i8, i8* %cast8, i32 2 +; CHECK: %cast10 = bitcast i8* %uglygep9 to i16* +; CHECK: %cast84 = bitcast i64* %t0 to i8* +; CHECK: %uglygep85 = getelementptr i8, i8* %cast84, i32 2 +; CHECK: %cast86 = bitcast i8* %uglygep85 to i16* +; CHECK: %load.trunc87 = load i16, i16* %cast86, align 2 +; CHECK: %[[ADD3:.*]] = add i16 %load.trunc87, 1 +; CHECK: %[[AND2:.*]] = and i16 %[[ADD3]], -1 +; CHECK: %cast45 = bitcast i64* %t0 to i8* +; CHECK: %uglygep46 = getelementptr i8, i8* %cast45, i32 2 +; CHECK: %cast47 = bitcast i8* %uglygep46 to i16* +; CHECK: %load.trunc48 = load i16, i16* %cast47, align 2 +; CHECK: %[[ADD4:.*]] = add i16 %load.trunc48, 1 +; CHECK: %trunc.zext49 = zext i16 %[[ADD4]] to i64 +; CHECK: store i16 %[[AND2]], i16* %cast10, align 2 +; CHECK: %arrayidx54 = getelementptr inbounds i16, i16* %t5, i64 %trunc.zext49 + + %add.i.sink = phi i64 [ %add.i, %if.else35 ], [ %add, %if.then31 ] + %t5 = inttoptr i64 %add.i.sink to i16* + %inc4578 = add i64 %bf.set, 65536 + %bf.shl48 = and i64 %inc4578, 4294901760 + %bf.clear49 = and i64 %bf.set, -4294901761 + %bf.set50 = or i64 %bf.shl48, %bf.clear49 + store i64 %bf.set50, i64* %t0, align 8 + %bf.lshr52 = lshr i64 %inc4578, 16 + %bf.clear53 = and i64 %bf.lshr52, 65535 + %arrayidx54 = 
getelementptr inbounds i16, i16* %t5, i64 %bf.clear53 + store i16 %conv.i, i16* %arrayidx54, align 2 + br label %return + +if.else55: ; preds = %land.lhs.true, %if.else +; CHECK: if.else55: +; CHECK: %cast80 = bitcast i64* %t0 to i8* +; CHECK: %uglygep81 = getelementptr i8, i8* %cast80, i32 4 +; CHECK: %cast82 = bitcast i8* %uglygep81 to i16* +; CHECK: %load.trunc83 = load i16, i16* %cast82, align 4 +; CHECK: store i16 %load.trunc83, i16* %t6, align 2 +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %lshr = lshr i64 %bf.shl63, 16 +; CHECK: %trunc = trunc i64 %lshr to i32 +; CHECK: store i32 %trunc, i32* %cast1, align 2 + + %conv59 = trunc i64 %bf.lshr21 to i16 + %t6 = bitcast i8* %p1 to i16* + store i16 %conv59, i16* %t6, align 2 + %bf.load61 = load i64, i64* %t0, align 8 + %conv60 = shl i64 %j.0.i, 32 + %bf.shl63 = and i64 %conv60, 281470681743360 + %bf.clear64 = and i64 %bf.load61, -281474976645121 + %bf.clear67 = or i64 %bf.clear64, %bf.shl63 + store i64 %bf.clear67, i64* %t0, align 8 + br label %return + +return: ; preds = %if.then9, %if.else55, %if.end41, %entry + %retval.0 = phi i1 [ false, %entry ], [ true, %if.end41 ], [ true, %if.else55 ], [ true, %if.then9 ] + ret i1 %retval.0 +} + Index: test/CodeGen/X86/store-shrink.ll =================================================================== --- test/CodeGen/X86/store-shrink.ll +++ test/CodeGen/X86/store-shrink.ll @@ -0,0 +1,363 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -memaccessshrink -S | FileCheck %s +; Check bitfield store is shrinked properly in cases below. + +; class A1 { +; unsigned long f1:8; +; unsigned long f2:3; +; } a1; +; a1.f1 = n; +; +; The bitfield store can be shrinked from i16 to i8. +; CHECK-LABEL: @test1( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %t0 = trunc i64 %conv to i16 +; CHECK: %bf.value = and i16 %t0, 255 +; CHECK: %trunc = trunc i16 %bf.value to i8 +; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8 + +%class.A1 = type { i16, [6 x i8] } +@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8 + +define void @test1(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %t0 = trunc i64 %conv to i16 + %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + %bf.value = and i16 %t0, 255 + %bf.clear = and i16 %bf.load, -256 + %bf.set = or i16 %bf.clear, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + ret void +} + +; class A2 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a2; +; a2.f1 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test2( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8 + +%class.A2 = type { i24, [4 x i8] } +@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8 + +define void @test2(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.clear = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear, %bf.value + store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8 + ret void +} + +; class A3 { +; unsigned long f1:32; +; unsigned long f2:3; +; } a3; +; a3.f1 = n; +; The bitfield store can be shrinked from i64 to i32. 
+; CHECK-LABEL: @test3( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %bf.value = and i64 %conv, 4294967295 +; CHECK: %trunc = trunc i64 %bf.value to i32 +; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8 + +%class.A3 = type { i40 } +@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8 + +define void @test3(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8 + %bf.value = and i64 %conv, 4294967295 + %bf.clear = and i64 %bf.load, -4294967296 + %bf.set = or i64 %bf.clear, %bf.value + store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8 + ret void +} + +; class A4 { +; unsigned long f1:13; +; unsigned long f2:3; +; } a4; +; a4.f1 = n; +; The bitfield store cannot be shrinked because the field is not 8/16/32 bits. +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. 
+; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. +; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. 
+; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. +; CHECK-LABEL: @i24_insert_bit( +; CHECK: %extbit = zext i1 %bit to i24 +; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13 +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i24 %extbit.shl, 8 +; CHECK: %trunc = trunc i24 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i24 + %b = load i24, i24* %a, align 1 + %extbit.shl = shl nuw nsw i24 %extbit, 13 + %c = and i24 %b, -8193 + %d = or i24 %c, %extbit.shl + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_or(i56* %a) { + %aa = load i56, i56* %a, align 1 + %b = or i56 %aa, 384 + store i56 %b, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_and_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_and_or(i56* %a) { + %b = load i56, i56* %a, align 1 + %c = and i56 %b, -128 + %d = or i56 %c, 384 + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store to i8. +; CHECK-LABEL: @i56_insert_bit( +; CHECK: %extbit = zext i1 %bit to i56 +; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13 +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %lshr = lshr i56 %extbit.shl, 8 +; CHECK: %trunc = trunc i56 %lshr to i8 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i56 + %b = load i56, i56* %a, align 1 + %extbit.shl = shl nuw nsw i56 %extbit, 13 + %c = and i56 %b, -8193 + %d = or i56 %c, %extbit.shl + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. 
+; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i16* +; CHECK: %load.trunc = load i16, i16* %cast1, align 2 +; CHECK: %or.trunc = or i16 %load.trunc, 272 +; CHECK: store i16 %or.trunc, i16* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -380,6 +380,7 @@ initializeTarget(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. + initializeMemAccessShrinkingPassPass(Registry); initializeCodeGenPreparePass(Registry); initializeAtomicExpandPass(Registry); initializeRewriteSymbolsLegacyPassPass(Registry);