Index: include/llvm/IR/PatternMatch.h
===================================================================
--- include/llvm/IR/PatternMatch.h
+++ include/llvm/IR/PatternMatch.h
@@ -317,6 +317,9 @@
 /// \brief Match a ConstantInt, capturing the value if we match.
 inline bind_ty<ConstantInt> m_ConstantInt(ConstantInt *&CI) { return CI; }
 
+/// \brief Match a load instruction, capturing the value if we match.
+inline bind_ty<LoadInst> m_Load(LoadInst *&LI) { return LI; }
+
 /// \brief Match a Constant, capturing the value if we match.
 inline bind_ty<Constant> m_Constant(Constant *&C) { return C; }
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -5709,6 +5709,342 @@
   return true;
 }
 
+/// Check whether memory will be modified between \p LI and \p SI.
+static bool memModBetween(LoadInst &LI, StoreInst &SI) {
+  // Check whether memory is modified inside a local iterator range of a
+  // BasicBlock.
+  auto memModInside = [](BasicBlock::const_iterator Start,
+                         BasicBlock::const_iterator End) -> bool {
+    for (const Instruction &Inst :
+         iterator_range<BasicBlock::const_iterator>(Start, End))
+      if (Inst.mayWriteToMemory())
+        return true;
+    return false;
+  };
+  const BasicBlock *LIBB = LI.getParent();
+  const BasicBlock *SIBB = SI.getParent();
+  BasicBlock::const_iterator LII(LI);
+  BasicBlock::const_iterator SII(SI);
+  if (LIBB == SIBB)
+    return memModInside(LII, SII);
+
+  // LIBB is different from SIBB. We need to scan from LI to the end of LIBB,
+  // from the beginning of SIBB to SI, and all BasicBlocks between LIBB and
+  // SIBB.
+  if (memModInside(LII, LIBB->end()))
+    return true;
+  if (memModInside(SIBB->begin(), SII))
+    return true;
+
+  // Collect the BasicBlocks to scan between LIBB and SIBB into BBSet.
+  // Limit the maximum number of BasicBlocks to 3 to protect compile time.
+  const int MaxBB = 3;
+  SmallPtrSet<const BasicBlock *, 4> BBSet;
+  SmallVector<const BasicBlock *, 4> WorkSet;
+  WorkSet.push_back(SIBB);
+  do {
+    const BasicBlock *BB = WorkSet.pop_back_val();
+    for (const BasicBlock *Pred : predecessors(BB)) {
+      if (Pred != LIBB && !BBSet.count(Pred)) {
+        BBSet.insert(Pred);
+        if (BBSet.size() > MaxBB)
+          return true;
+        WorkSet.push_back(Pred);
+      }
+    }
+  } while (!WorkSet.empty());
+
+  for (const BasicBlock *BB : BBSet) {
+    if (memModInside(BB->begin(), BB->end()))
+      return true;
+  }
+  return false;
+}
+
+/// Analyze or((and (load P), \p Cst), \p MaskedVal). Update \p ActualModBits
+/// with the number of bits of the original load to be modified, and update
+/// \p ShiftBits with the position of the first bit to be modified. If the
+/// analysis shows we can store MaskedVal, after some adjustment, without
+/// using the original load, keep \p StoreWithoutLoad true.
+static void analyzeOrAndPattern(Value &MaskedVal, ConstantInt &Cst,
+                                unsigned &ShiftBits, unsigned &ActualModBits,
+                                bool &StoreWithoutLoad, StoreInst &SI,
+                                const DataLayout &DL) {
+  // Cst is the mask. Analyze the bit pattern of the mask: we handle masks of
+  // the form 0..01..1 or 1..10..01..1.
+  APInt Mask = Cst.getValue();
+  unsigned MBitWidth = Mask.getBitWidth();
+  unsigned MaskLeadOnes = Mask.countLeadingOnes();
+  unsigned MaskTrailOnes = Mask.countTrailingOnes();
+  unsigned MaskMidZeros = !MaskLeadOnes
+                              ? Mask.countLeadingZeros()
+                              : Mask.ashr(MaskTrailOnes).countTrailingZeros();
+
+  StoreWithoutLoad = true;
+  // See if we have a continuous run of zeros.
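+  // For example, a 24-bit mask 0xFF00FF has MaskLeadOnes = 8, MaskMidZeros = 8
+  // and MaskTrailOnes = 8, which sum to the bit width, so the zeros form a
+  // single continuous run and the 8 masked-off bits are candidates for being
+  // stored without reading the original value.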
+  if (MaskMidZeros == 0 ||
+      MaskLeadOnes + MaskMidZeros + MaskTrailOnes != MBitWidth)
+    StoreWithoutLoad = false;
+
+  // Check that MaskedVal only provides nonzero bits within the range from
+  // lowbits (MaskTrailOnes) to highbits (MaskTrailOnes + MaskMidZeros).
+  APInt BitMask = ~APInt::getBitsSet(MBitWidth, MaskTrailOnes,
+                                     MaskTrailOnes + MaskMidZeros);
+
+  // Find out the range in which 1 appears in MaskedVal.
+  APInt KnownOne(MBitWidth, 0), KnownZero(MBitWidth, 0);
+  computeKnownBits(&MaskedVal, KnownZero, KnownOne, DL, 0);
+
+  ActualModBits = MaskMidZeros;
+  ShiftBits = MaskTrailOnes;
+  // Check !MaskedValueIsZero(MaskedVal, BitMask) by inlining the call,
+  // because we want to reuse the result of computeKnownBits to compute
+  // ShiftBits and ActualModBits.
+  if ((KnownZero & BitMask) != BitMask) {
+    StoreWithoutLoad = false;
+    unsigned Lower = KnownOne.countTrailingZeros();
+    unsigned Higher = MBitWidth - KnownOne.countLeadingZeros();
+    ShiftBits = std::min(Lower, MaskTrailOnes);
+    ActualModBits = std::max(Higher, MaskTrailOnes + MaskMidZeros) - ShiftBits;
+  }
+}
+
+/// Analyze \p Val = or/xor/and((load P), \p Cst). Update \p ActualModBits
+/// with the number of bits of the original load to be modified, and update
+/// \p ShiftBits with the position of the first bit to be modified.
+static void analyzeBOpPattern(Value &Val, ConstantInt &Cst, unsigned &ShiftBits,
+                              unsigned &ActualModBits) {
+  APInt Mask = Cst.getValue();
+  BinaryOperator *BOP = cast<BinaryOperator>(&Val);
+  if (BOP->getOpcode() == Instruction::And)
+    Mask = ~Mask;
+
+  ShiftBits = Mask.countTrailingZeros();
+  ActualModBits = Mask.getBitWidth() - ShiftBits;
+  if (ActualModBits)
+    ActualModBits = ActualModBits - Mask.countLeadingZeros();
+}
+
+/// Update \p ActualModBits and \p ShiftBits so that the updated
+/// \p ActualModBits bits can form a legal type and also cover all the
+/// modified bits.
+static void updateShiftAndModifiedBits(unsigned &ActualModBits,
+                                       unsigned &ShiftBits, unsigned TBits,
+                                       unsigned Align, LLVMContext &Context,
+                                       const DataLayout &DL,
+                                       const TargetLowering &TLI) {
+  unsigned NewModBits = PowerOf2Ceil(ActualModBits);
+  Type *NewTy = Type::getIntNTy(Context, NewModBits);
+  int NewShiftBits = 0;
+
+  // Check whether we can find a NewShiftBits for NewModBits, so that
+  // NewShiftBits and NewModBits form a new range covering the old modified
+  // range without worsening alignment.
+  auto coverOldRange = [&]() -> bool {
+    unsigned MAlign = MinAlign(Align, DL.getABITypeAlignment(NewTy));
+    NewShiftBits = ShiftBits - ShiftBits % (MAlign * 8);
+    while (NewShiftBits >= 0) {
+      if (NewModBits + NewShiftBits <= TBits &&
+          NewModBits + NewShiftBits >= ActualModBits + ShiftBits)
+        return true;
+      NewShiftBits -= MAlign * 8;
+    }
+    return false;
+  };
+  // See whether we can store NewTy legally.
+  auto isStoreLegalType = [&]() -> bool {
+    EVT OldEVT =
+        TLI.getValueType(DL, Type::getIntNTy(Context, PowerOf2Ceil(TBits)));
+    EVT NewEVT = TLI.getValueType(DL, NewTy);
+    return TLI.isOperationLegalOrCustom(ISD::STORE, NewEVT) ||
+           TLI.isTruncStoreLegalOrCustom(OldEVT, NewEVT);
+  };
+  // Try to find the minimal NewModBits which can form a legal type and cover
+  // all the old modified bits.
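+  // For example, with TBits = 56, ShiftBits = 4, ActualModBits = 9 and a
+  // byte-aligned store, NewModBits starts at 16 and NewShiftBits is rounded
+  // down to 0; the range [0, 16) covers the modified bits [4, 13), so an i16
+  // store at byte offset 0 is chosen if i16 stores are legal on the target.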
+  while (NewModBits < TBits && (!isStoreLegalType() || !coverOldRange())) {
+    NewModBits = NextPowerOf2(NewModBits);
+    NewTy = Type::getIntNTy(Context, NewModBits);
+  }
+  ActualModBits = NewModBits;
+  ShiftBits = NewShiftBits;
+}
+
+// If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
+// is a byte mask indicating a consecutive number of bytes, check to see if
+// Y is known to provide just those bytes. If so, what the original store
+// sequence is doing is "load P into V; replace some bytes of V with the
+// corresponding bytes of Y via bit manipulation; then store the updated V
+// back to P". We can replace the sequence with a single (narrower) store and
+// save the load and the bit operations (the first shrink transformation).
+//
+// If this is "store (or/and/xor (load P), cst), P", we know the maximum range
+// in which the load value will be modified, and we can shrink the store as
+// long as the shrunk size still covers the modified range (the second shrink
+// transformation). This is especially beneficial when the original size is
+// illegal and the size after shrinking is legal.
+//
+// For the first pattern, when the first shrink transformation fails, we can
+// still try the second one, because the first pattern also tells us the
+// range in which the load value will be modified.
+static bool reduceLoadOpsStoreWidth(StoreInst &SI, const DataLayout &DL,
+                                    const TargetLowering &TLI) {
+  Value *Val = SI.getOperand(0);
+  Value *Ptr = SI.getOperand(1);
+  Type *StoreTy = Val->getType();
+  if (StoreTy->isVectorTy() || !StoreTy->isIntegerTy() || !Val->hasOneUse())
+    return false;
+
+  unsigned TBits = DL.getTypeSizeInBits(StoreTy);
+  if (TBits != DL.getTypeStoreSizeInBits(StoreTy))
+    return false;
+
+  LoadInst *LI = nullptr;
+  Value *MaskedVal = nullptr;
+  ConstantInt *Cst = nullptr;
+  // Match "or((and (load P), cst), Y)" or "or/and/xor((load P), cst)" or
+  // their commuted forms.
+  bool OrAndPattern = false;
+  if (!(OrAndPattern = match(Val, m_c_Or(m_And(m_Load(LI), m_ConstantInt(Cst)),
+                                         m_Value(MaskedVal)))) &&
+      !match(Val, m_c_Or(m_Load(LI), m_ConstantInt(Cst))) &&
+      !match(Val, m_c_And(m_Load(LI), m_ConstantInt(Cst))) &&
+      !match(Val, m_c_Xor(m_Load(LI), m_ConstantInt(Cst))))
+    return false;
+
+  // LI should have the same address as SI.
+  if (LI->getOperand(0) != Ptr)
+    return false;
+
+  // Make sure the memory SI accesses is not modified between LI and SI.
+  if (memModBetween(*LI, SI))
+    return false;
+
+  // Ideally, we hope to optimize the load-and-or-store sequence into a single
+  // shrunk store of the shifted and truncated MaskedVal without accessing the
+  // original load, but if we cannot, we will try to optimize the sequence
+  // into a shrunk store of the truncated Val.
+  // For a load-bop-store sequence, StoreWithoutLoad will always be false.
+  bool StoreWithoutLoad = false;
+  // ActualModBits indicates the number of bits of the original load to be
+  // modified.
+  // ShiftBits indicates the position of the first bit to be modified.
+  // Both values are populated by analyzeOrAndPattern/analyzeBOpPattern below.
+  unsigned ActualModBits;
+  unsigned ShiftBits;
+  if (OrAndPattern)
+    analyzeOrAndPattern(*MaskedVal, *Cst, ShiftBits, ActualModBits,
+                        StoreWithoutLoad, SI, DL);
+  else
+    analyzeBOpPattern(*Val, *Cst, ShiftBits, ActualModBits);
+
+  // Clamp ActualModBits so the modified range does not go beyond the actual
+  // size of the store.
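+  // For example, with TBits = 16, ShiftBits = 8 and ActualModBits = 12, the
+  // range is clamped to the 8 bits starting at bit 8.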
+  if (ShiftBits > TBits) {
+    ShiftBits = 0;
+    ActualModBits = TBits;
+  } else if (ShiftBits + ActualModBits > TBits) {
+    ActualModBits = TBits - ShiftBits;
+  }
+
+  unsigned StOffset = 0;
+  if (StoreWithoutLoad) {
+    // Get the offset from Ptr for the shrunk store.
+    if (DL.isBigEndian())
+      StOffset = TBits - ShiftBits - ActualModBits;
+    else
+      StOffset = ShiftBits;
+    if (StOffset % 8 != 0)
+      StoreWithoutLoad = false;
+    else
+      StOffset = StOffset / 8;
+
+    // If ActualModBits is not the width of a legal type, we cannot store
+    // MaskedVal directly.
+    if (ActualModBits != 8 && ActualModBits != 16 && ActualModBits != 32)
+      StoreWithoutLoad = false;
+  }
+
+  unsigned Align = SI.getAlignment();
+  LLVMContext &Context = SI.getContext();
+  // If we are shrinking the store of Val, update ActualModBits and ShiftBits
+  // to ensure the shrunk store is of a legal type.
+  if (!StoreWithoutLoad) {
+    // If we cannot do the StoreWithoutLoad shrink, do the simple shrink only
+    // when StoreTy is illegal.
+    if (TLI.isOperationLegalOrCustom(ISD::STORE, TLI.getValueType(DL, StoreTy)))
+      return false;
+    if (!ActualModBits)
+      return false;
+    updateShiftAndModifiedBits(ActualModBits, ShiftBits, TBits, Align, Context,
+                               DL, TLI);
+    if (ActualModBits >= TBits)
+      return false;
+
+    if (DL.isBigEndian())
+      StOffset = (TBits - ShiftBits - ActualModBits) / 8;
+    else
+      StOffset = ShiftBits / 8;
+  }
+
+  // Start shrinking the size of the store.
+  Value *NewPtr = Ptr;
+  unsigned AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
+  IRBuilder<> Builder(Context);
+  Builder.SetInsertPoint(&SI);
+  if (StOffset) {
+    ConstantInt *Idx = ConstantInt::get(Type::getInt32Ty(Context), StOffset);
+    NewPtr =
+        Builder.CreateBitCast(Ptr, Type::getInt8PtrTy(Context, AS), "cast");
+    NewPtr =
+        Builder.CreateGEP(Type::getInt8Ty(Context), NewPtr, Idx, "uglygep");
+    Align = MinAlign(StOffset, Align);
+  }
+  Type *NewTy = Type::getIntNTy(Context, ActualModBits);
+  NewPtr = Builder.CreateBitCast(NewPtr, NewTy->getPointerTo(AS), "cast");
+  APInt ModifiedCst = Cst->getValue().lshr(ShiftBits).trunc(ActualModBits);
+  ConstantInt *NewCst = ConstantInt::get(Context, ModifiedCst);
+
+  Value *NewVal;
+  if (OrAndPattern) {
+    // Shift and truncate MaskedVal.
+    Value *Trunc;
+    if (auto *MVCst = dyn_cast<ConstantInt>(MaskedVal)) {
+      ModifiedCst = MVCst->getValue().lshr(ShiftBits).trunc(ActualModBits);
+      Trunc = ConstantInt::get(Context, ModifiedCst);
+    } else {
+      Value *ShiftedVal = ShiftBits
+                              ? Builder.CreateLShr(MaskedVal, ShiftBits, "lshr")
+                              : MaskedVal;
+      Trunc = Builder.CreateTruncOrBitCast(ShiftedVal, NewTy, "trunc");
+    }
+    // Create NewVal to store.
+    if (StoreWithoutLoad) {
+      NewVal = Trunc;
+    } else {
+      Value *NewLoad = Builder.CreateAlignedLoad(NewPtr, Align, "load.trunc");
+      Value *NewAnd = Builder.CreateAnd(NewLoad, NewCst, "and.trunc");
+      NewVal = Builder.CreateOr(NewAnd, Trunc, "or.trunc");
+    }
+  } else {
+    Value *NewLoad = Builder.CreateAlignedLoad(NewPtr, Align, "load.trunc");
+    // Create NewVal to store.
+    BinaryOperator *BOP = cast<BinaryOperator>(Val);
+    switch (BOP->getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected opcode");
+    case Instruction::And:
+      NewVal = Builder.CreateAnd(NewLoad, NewCst, "and.trunc");
+      break;
+    case Instruction::Or:
+      NewVal = Builder.CreateOr(NewLoad, NewCst, "or.trunc");
+      break;
+    case Instruction::Xor:
+      NewVal = Builder.CreateXor(NewLoad, NewCst, "xor.trunc");
+      break;
+    }
+  }
+  // Create the new store and remove the old one.
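+  // Note that only the original store is erased here; Val had a single use,
+  // so it becomes trivially dead and is left for later dead-code cleanup.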
+  Builder.CreateAlignedStore(NewVal, NewPtr, Align);
+  SI.eraseFromParent();
+  return true;
+}
+
 bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
@@ -5775,6 +6111,8 @@
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
     if (TLI && splitMergedValStore(*SI, *DL, *TLI))
       return true;
+    if (TLI && reduceLoadOpsStoreWidth(*SI, *DL, *TLI))
+      return true;
     SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     if (TLI) {
       unsigned AS = SI->getPointerAddressSpace();
Index: test/CodeGen/ARM/bitfield-store.ll
===================================================================
--- test/CodeGen/ARM/bitfield-store.ll
+++ test/CodeGen/ARM/bitfield-store.ll
@@ -0,0 +1,364 @@
+; RUN: opt < %s -mtriple=arm-eabi -codegenprepare -S | FileCheck %s
+; Check that bitfield stores are shrunk properly in the cases below.
+
+; class A1 {
+;   unsigned long f1:8;
+;   unsigned long f2:3;
+; } a1;
+; a1.f1 = n;
+;
+; The bitfield store can be shrunk from i16 to i8.
+; CHECK-LABEL: @test1(
+; CHECK: %conv = zext i32 %n to i64
+; CHECK: %t0 = trunc i64 %conv to i16
+; CHECK: %bf.value = and i16 %t0, 255
+; CHECK: %trunc = trunc i16 %bf.value to i8
+; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8
+
+%class.A1 = type { i16, [6 x i8] }
+@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8
+
+define void @test1(i32 %n) {
+entry:
+  %conv = zext i32 %n to i64
+  %t0 = trunc i64 %conv to i16
+  %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8
+  %bf.value = and i16 %t0, 255
+  %bf.clear = and i16 %bf.load, -256
+  %bf.set = or i16 %bf.clear, %bf.value
+  store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8
+  ret void
+}
+
+; class A2 {
+;   unsigned long f1:16;
+;   unsigned long f2:3;
+; } a2;
+; a2.f1 = n;
+; The bitfield store can be shrunk from i32 to i16.
+; CHECK-LABEL: @test2(
+; CHECK: %bf.value = and i32 %n, 65535
+; CHECK: %trunc = trunc i32 %bf.value to i16
+; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8
+
+%class.A2 = type { i24, [4 x i8] }
+@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8
+
+define void @test2(i32 %n) {
+entry:
+  %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8
+  %bf.value = and i32 %n, 65535
+  %bf.clear = and i32 %bf.load, -65536
+  %bf.set = or i32 %bf.clear, %bf.value
+  store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8
+  ret void
+}
+
+; class A3 {
+;   unsigned long f1:32;
+;   unsigned long f2:3;
+; } a3;
+; a3.f1 = n;
+; The bitfield store can be shrunk from i64 to i32.
+; CHECK-LABEL: @test3(
+; CHECK: %conv = zext i32 %n to i64
+; CHECK: %bf.value = and i64 %conv, 4294967295
+; CHECK: %trunc = trunc i64 %bf.value to i32
+; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8
+
+%class.A3 = type { i40 }
+@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8
+
+define void @test3(i32 %n) {
+entry:
+  %conv = zext i32 %n to i64
+  %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8
+  %bf.value = and i64 %conv, 4294967295
+  %bf.clear = and i64 %bf.load, -4294967296
+  %bf.set = or i64 %bf.clear, %bf.value
+  store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8
+  ret void
+}
+
+; class A4 {
+;   unsigned long f1:13;
+;   unsigned long f2:3;
+; } a4;
+; a4.f1 = n;
+; The bitfield store cannot be shrunk because the field is not 8/16/32 bits.
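+; Rounding the 13 modified bits up to a storable width gives back the full
+; 16-bit store, so there is nothing to save.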
+; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. +; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. 
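+; f2 occupies bits 16-31, a byte-aligned 16-bit chunk, so the new value is
+; shifted down, truncated and stored directly to the upper half at offset 2.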
+; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. +; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. 
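+; Only bit 13 of the i24 value changes; bit 13 lies in the middle byte, so an
+; i8 load/modify/store at byte offset 1 is enough.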
+; CHECK-LABEL: @i24_insert_bit(
+; CHECK: %extbit = zext i1 %bit to i24
+; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13
+; CHECK: %cast = bitcast i24* %a to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1
+; CHECK: %lshr = lshr i24 %extbit.shl, 8
+; CHECK: %trunc = trunc i24 %lshr to i8
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 1
+; CHECK: %and.trunc = and i8 %load.trunc, -33
+; CHECK: %or.trunc = or i8 %and.trunc, %trunc
+; CHECK: store i8 %or.trunc, i8* %uglygep, align 1
+;
+define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
+  %extbit = zext i1 %bit to i24
+  %b = load i24, i24* %a, align 1
+  %extbit.shl = shl nuw nsw i24 %extbit, 13
+  %c = and i24 %b, -8193
+  %d = or i24 %c, %extbit.shl
+  store i24 %d, i24* %a, align 1
+  ret void
+}
+
+; Cannot remove the load and bit operations, but can still shrink the i56 store
+; to i32.
+; CHECK-LABEL: @i56_or(
+; CHECK: %cast = bitcast i56* %a to i32*
+; CHECK: %load.trunc = load i32, i32* %cast, align 1
+; CHECK: %or.trunc = or i32 %load.trunc, 384
+; CHECK: store i32 %or.trunc, i32* %cast, align 1
+;
+define void @i56_or(i56* %a) {
+  %aa = load i56, i56* %a, align 1
+  %b = or i56 %aa, 384
+  store i56 %b, i56* %a, align 1
+  ret void
+}
+
+; Cannot remove the load and bit operations, but can shrink the i56 store
+; to i32.
+; CHECK-LABEL: @i56_and_or(
+; CHECK: %cast = bitcast i56* %a to i32*
+; CHECK: %load.trunc = load i32, i32* %cast, align 1
+; CHECK: %and.trunc = and i32 %load.trunc, -128
+; CHECK: %or.trunc = or i32 %and.trunc, 384
+; CHECK: store i32 %or.trunc, i32* %cast, align 1
+;
+define void @i56_and_or(i56* %a) {
+  %b = load i56, i56* %a, align 1
+  %c = and i56 %b, -128
+  %d = or i56 %c, 384
+  store i56 %d, i56* %a, align 1
+  ret void
+}
+
+; Cannot remove the load and bit operations, but can shrink the i56 store to
+; i32.
+; CHECK-LABEL: @i56_insert_bit(
+; CHECK: %extbit = zext i1 %bit to i56
+; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13
+; CHECK: %cast = bitcast i56* %a to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1
+; CHECK: %cast1 = bitcast i8* %uglygep to i32*
+; CHECK: %lshr = lshr i56 %extbit.shl, 8
+; CHECK: %trunc = trunc i56 %lshr to i32
+; CHECK: %load.trunc = load i32, i32* %cast1, align 1
+; CHECK: %and.trunc = and i32 %load.trunc, -33
+; CHECK: %or.trunc = or i32 %and.trunc, %trunc
+; CHECK: store i32 %or.trunc, i32* %cast1, align 1
+;
+define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
+  %extbit = zext i1 %bit to i56
+  %b = load i56, i56* %a, align 1
+  %extbit.shl = shl nuw nsw i56 %extbit, 13
+  %c = and i56 %b, -8193
+  %d = or i56 %c, %extbit.shl
+  store i56 %d, i56* %a, align 1
+  ret void
+}
+
+; Cannot remove the load and bit operations, but can still shrink the i56 store
+; to i32.
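+; 17825792 is 0x1100000, so only bits 20 and 24 change; with the store only
+; 2-byte aligned, the shrunk 4-byte access is placed at byte offset 2 so that
+; the original alignment is preserved.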
+; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %load.trunc = load i32, i32* %cast1, align 2 +; CHECK: %or.trunc = or i32 %load.trunc, 272 +; CHECK: store i32 %or.trunc, i32* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: test/CodeGen/ARM/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -12,13 +12,9 @@ ; ; BE-LABEL: i24_or: ; BE: @ BB#0: -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: ldrb r2, [r0, #2] -; BE-NEXT: orr r1, r2, r1, lsl #8 +; BE-NEXT: ldrh r1, [r0, #1] ; BE-NEXT: orr r1, r1, #384 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] +; BE-NEXT: strh r1, [r0, #1] ; BE-NEXT: mov pc, lr %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 @@ -29,36 +25,23 @@ define void @i24_and_or(i24* %a) { ; LE-LABEL: i24_and_or: ; LE: @ BB#0: -; LE-NEXT: ldrb r1, [r0, #2] -; LE-NEXT: ldrh r2, [r0] -; LE-NEXT: orr r1, r2, r1, lsl #16 -; LE-NEXT: ldr r2, .LCPI1_0 +; LE-NEXT: ldrh r1, [r0] +; LE-NEXT: mov r2, #16256 +; LE-NEXT: orr r2, r2, #49152 ; LE-NEXT: orr r1, r1, #384 ; LE-NEXT: and r1, r1, r2 ; LE-NEXT: strh r1, [r0] -; LE-NEXT: lsr r1, r1, #16 -; LE-NEXT: strb r1, [r0, #2] ; LE-NEXT: mov pc, lr -; LE-NEXT: .p2align 2 -; LE-NEXT: @ BB#1: -; LE-NEXT: .LCPI1_0: -; LE-NEXT: .long 16777088 @ 0xffff80 ; ; BE-LABEL: i24_and_or: ; BE: @ BB#0: -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: mov r2, #384 -; BE-NEXT: orr r1, r2, r1, lsl #8 -; BE-NEXT: ldr r2, .LCPI1_0 +; BE-NEXT: ldrh r1, [r0, #1] +; BE-NEXT: mov r2, #16256 +; BE-NEXT: orr r2, r2, #49152 +; BE-NEXT: orr r1, r1, #384 ; BE-NEXT: and r1, r1, r2 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] -; BE-NEXT: mov pc, lr -; BE-NEXT: .p2align 2 -; BE-NEXT: @ BB#1: -; BE-NEXT: .LCPI1_0: -; BE-NEXT: .long 16777088 @ 0xffff80 +; BE-NEXT: strh r1, [r0, #1] +; BE-NEXT: mov pc, lr %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 %d = or i24 %c, 384 @@ -69,37 +52,19 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; LE-LABEL: i24_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldrb r2, [r0, #2] -; LE-NEXT: ldrh r3, [r0] -; LE-NEXT: orr r2, r3, r2, lsl #16 -; LE-NEXT: ldr r3, .LCPI2_0 -; LE-NEXT: and r2, r2, r3 -; LE-NEXT: lsr r3, r2, #16 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: strb r3, [r0, #2] -; LE-NEXT: strh r1, [r0] +; LE-NEXT: ldrb r2, [r0, #1] +; LE-NEXT: and r2, r2, #223 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: strb r1, [r0, #1] ; LE-NEXT: mov pc, lr -; LE-NEXT: .p2align 2 -; LE-NEXT: @ BB#1: -; LE-NEXT: .LCPI2_0: -; LE-NEXT: .long 16769023 @ 0xffdfff ; ; BE-LABEL: i24_insert_bit: ; BE: @ BB#0: -; BE-NEXT: ldrh r2, [r0] -; BE-NEXT: ldrb r3, [r0, #2] -; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: ldr r3, .LCPI2_0 -; BE-NEXT: and r2, r2, r3 -; BE-NEXT: orr r1, r2, r1, lsl #13 -; BE-NEXT: strb r2, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] -; BE-NEXT: mov pc, lr -; BE-NEXT: .p2align 2 -; BE-NEXT: @ BB#1: -; BE-NEXT: .LCPI2_0: -; BE-NEXT: .long 16769023 @ 0xffdfff +; BE-NEXT: ldrb r2, [r0, #1] +; BE-NEXT: and r2, r2, #223 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: strb r1, [r0, #1] +; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i24 %b = load 
i24, i24* %a, align 1 %extbit.shl = shl nuw nsw i24 %extbit, 13 @@ -119,19 +84,9 @@ ; ; BE-LABEL: i56_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: ldrh r2, [r1, #4]! -; BE-NEXT: ldrb r3, [r1, #2] -; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strb r2, [r1, #2] -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %aa = load i56, i56* %a %b = or i56 %aa, 384 @@ -150,19 +105,10 @@ ; ; BE-LABEL: i56_and_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: mov r3, #128 -; BE-NEXT: ldrh r2, [r1, #4]! -; BE-NEXT: strb r3, [r1, #2] -; BE-NEXT: lsl r2, r2, #8 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: bic r1, r1, #127 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %b = load i56, i56* %a, align 1 @@ -175,31 +121,18 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { ; LE-LABEL: i56_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldr r2, [r0] -; LE-NEXT: bic r2, r2, #8192 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: str r1, [r0] +; LE-NEXT: ldr r2, [r0, #1] +; LE-NEXT: bic r2, r2, #32 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: str r1, [r0, #1] ; LE-NEXT: mov pc, lr ; ; BE-LABEL: i56_insert_bit: ; BE: @ BB#0: -; BE-NEXT: .save {r11, lr} -; BE-NEXT: push {r11, lr} -; BE-NEXT: mov r2, r0 -; BE-NEXT: ldr lr, [r0] -; BE-NEXT: ldrh r12, [r2, #4]! -; BE-NEXT: ldrb r3, [r2, #2] -; BE-NEXT: orr r12, r3, r12, lsl #8 -; BE-NEXT: orr r3, r12, lr, lsl #24 -; BE-NEXT: bic r3, r3, #8192 -; BE-NEXT: orr r1, r3, r1, lsl #13 -; BE-NEXT: strb r3, [r2, #2] -; BE-NEXT: lsr r3, r1, #8 -; BE-NEXT: strh r3, [r2] -; BE-NEXT: bic r2, lr, #255 -; BE-NEXT: orr r1, r2, r1, lsr #24 -; BE-NEXT: str r1, [r0] -; BE-NEXT: pop {r11, lr} +; BE-NEXT: ldr r2, [r0, #2] +; BE-NEXT: bic r2, r2, #32 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: str r1, [r0, #2] ; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 Index: test/CodeGen/X86/bitfield-store.ll =================================================================== --- test/CodeGen/X86/bitfield-store.ll +++ test/CodeGen/X86/bitfield-store.ll @@ -0,0 +1,363 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -codegenprepare -S | FileCheck %s +; Check bitfield store is shrinked properly in cases below. + +; class A1 { +; unsigned long f1:8; +; unsigned long f2:3; +; } a1; +; a1.f1 = n; +; +; The bitfield store can be shrinked from i16 to i8. 
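+; f1 is the low byte of the i16, so storing the truncated value as an i8 at
+; offset 0 suffices.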
+; CHECK-LABEL: @test1( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %t0 = trunc i64 %conv to i16 +; CHECK: %bf.value = and i16 %t0, 255 +; CHECK: %trunc = trunc i16 %bf.value to i8 +; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8 + +%class.A1 = type { i16, [6 x i8] } +@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8 + +define void @test1(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %t0 = trunc i64 %conv to i16 + %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + %bf.value = and i16 %t0, 255 + %bf.clear = and i16 %bf.load, -256 + %bf.set = or i16 %bf.clear, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + ret void +} + +; class A2 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a2; +; a2.f1 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test2( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8 + +%class.A2 = type { i24, [4 x i8] } +@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8 + +define void @test2(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.clear = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear, %bf.value + store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8 + ret void +} + +; class A3 { +; unsigned long f1:32; +; unsigned long f2:3; +; } a3; +; a3.f1 = n; +; The bitfield store can be shrinked from i64 to i32. +; CHECK-LABEL: @test3( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %bf.value = and i64 %conv, 4294967295 +; CHECK: %trunc = trunc i64 %bf.value to i32 +; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8 + +%class.A3 = type { i40 } +@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8 + +define void @test3(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8 + %bf.value = and i64 %conv, 4294967295 + %bf.clear = and i64 %bf.load, -4294967296 + %bf.set = or i64 %bf.clear, %bf.value + store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8 + ret void +} + +; class A4 { +; unsigned long f1:13; +; unsigned long f2:3; +; } a4; +; a4.f1 = n; +; The bitfield store cannot be shrinked because the field is not 8/16/32 bits. 
+; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. +; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. 
+; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. +; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. 
+; CHECK-LABEL: @i24_insert_bit( +; CHECK: %extbit = zext i1 %bit to i24 +; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13 +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %lshr = lshr i24 %extbit.shl, 8 +; CHECK: %trunc = trunc i24 %lshr to i8 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i24 + %b = load i24, i24* %a, align 1 + %extbit.shl = shl nuw nsw i24 %extbit, 13 + %c = and i24 %b, -8193 + %d = or i24 %c, %extbit.shl + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_or(i56* %a) { + %aa = load i56, i56* %a, align 1 + %b = or i56 %aa, 384 + store i56 %b, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_and_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_and_or(i56* %a) { + %b = load i56, i56* %a, align 1 + %c = and i56 %b, -128 + %d = or i56 %c, 384 + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store to i8. +; CHECK-LABEL: @i56_insert_bit( +; CHECK: %extbit = zext i1 %bit to i56 +; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13 +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %lshr = lshr i56 %extbit.shl, 8 +; CHECK: %trunc = trunc i56 %lshr to i8 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i56 + %b = load i56, i56* %a, align 1 + %extbit.shl = shl nuw nsw i56 %extbit, 13 + %c = and i56 %b, -8193 + %d = or i56 %c, %extbit.shl + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. 
+; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i16* +; CHECK: %load.trunc = load i16, i16* %cast1, align 2 +; CHECK: %or.trunc = or i16 %load.trunc, 272 +; CHECK: store i16 %or.trunc, i16* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: test/CodeGen/X86/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -4,13 +4,7 @@ define void @i24_or(i24* %a) { ; CHECK-LABEL: i24_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180 ; CHECK-NEXT: retq %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 @@ -22,14 +16,9 @@ ; CHECK-LABEL: i24_and_or: ; CHECK: # BB#0: ; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 -; CHECK-NEXT: movw %cx, (%rdi) -; CHECK-NEXT: shrl $16, %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) +; CHECK-NEXT: orl $384, %eax # imm = 0x180 +; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80 +; CHECK-NEXT: movw %ax, (%rdi) ; CHECK-NEXT: retq %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 @@ -41,17 +30,11 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; CHECK-LABEL: i24_insert_bit: ; CHECK: # BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: movzwl (%rdi), %ecx -; CHECK-NEXT: movzbl 2(%rdi), %edx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: shll $13, %eax -; CHECK-NEXT: andl $16769023, %edx # imm = 0xFFDFFF -; CHECK-NEXT: orl %edx, %eax -; CHECK-NEXT: shrl $16, %edx -; CHECK-NEXT: movb %dl, 2(%rdi) -; CHECK-NEXT: movw %ax, (%rdi) +; CHECK-NEXT: shlb $5, %sil +; CHECK-NEXT: movb 1(%rdi), %al +; CHECK-NEXT: andb $-33, %al +; CHECK-NEXT: orb %sil, %al +; CHECK-NEXT: movb %al, 1(%rdi) ; CHECK-NEXT: retq %extbit = zext i1 %bit to i24 %b = load i24, i24* %a, align 1 @@ -65,19 +48,7 @@ define void @i56_or(i56* %a) { ; CHECK-LABEL: i56_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) -; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: orq $384, %rdx # imm = 0x180 -; CHECK-NEXT: movl %edx, (%rdi) -; CHECK-NEXT: shrq $32, %rdx -; CHECK-NEXT: movw %dx, 4(%rdi) +; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180 ; CHECK-NEXT: retq %aa = load i56, i56* %a, align 1 %b = or i56 %aa, 384 @@ -88,22 +59,10 @@ define void @i56_and_or(i56* %a) { ; CHECK-LABEL: i56_and_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: movl (%rdi), %eax -; CHECK-NEXT: orq %rcx, %rax -; CHECK-NEXT: orq $384, %rax # imm = 0x180 -; CHECK-NEXT: movabsq $72057594037927808, %rcx # imm = 
0xFFFFFFFFFFFF80
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movl %ecx, (%rdi)
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movw %ax, 4(%rdi)
-; CHECK-NEXT: shrq $48, %rcx
-; CHECK-NEXT: movb %cl, 6(%rdi)
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: orl $384, %eax # imm = 0x180
+; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80
+; CHECK-NEXT: movw %ax, (%rdi)
 ; CHECK-NEXT: retq
 %b = load i56, i56* %a, align 1
 %c = and i56 %b, -128
@@ -115,23 +74,11 @@
 define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
 ; CHECK-LABEL: i56_insert_bit:
 ; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: movzwl 4(%rdi), %ecx
-; CHECK-NEXT: movzbl 6(%rdi), %edx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: shlq $32, %rdx
-; CHECK-NEXT: movl (%rdi), %ecx
-; CHECK-NEXT: orq %rdx, %rcx
-; CHECK-NEXT: shlq $13, %rax
-; CHECK-NEXT: movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF
-; CHECK-NEXT: andq %rcx, %rdx
-; CHECK-NEXT: orq %rdx, %rax
-; CHECK-NEXT: movl %eax, (%rdi)
-; CHECK-NEXT: shrq $48, %rdx
-; CHECK-NEXT: movb %dl, 6(%rdi)
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movw %ax, 4(%rdi)
+; CHECK-NEXT: shlb $5, %sil
+; CHECK-NEXT: movb 1(%rdi), %al
+; CHECK-NEXT: andb $-33, %al
+; CHECK-NEXT: orb %sil, %al
+; CHECK-NEXT: movb %al, 1(%rdi)
+; CHECK-NEXT: retq
 %extbit = zext i1 %bit to i56
 %b = load i56, i56* %a, align 1