Index: include/llvm/IR/PatternMatch.h
===================================================================
--- include/llvm/IR/PatternMatch.h
+++ include/llvm/IR/PatternMatch.h
@@ -304,6 +304,9 @@
 /// \brief Match a ConstantInt, capturing the value if we match.
 inline bind_ty<ConstantInt> m_ConstantInt(ConstantInt *&CI) { return CI; }

+/// \brief Match a load instruction, capturing the value if we match.
+inline bind_ty<LoadInst> m_Load(LoadInst *&LI) { return LI; }
+
 /// \brief Match a Constant, capturing the value if we match.
 inline bind_ty<Constant> m_Constant(Constant *&C) { return C; }

Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -22,9 +22,11 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
+using namespace PatternMatch;

 #define DEBUG_TYPE "instcombine"

@@ -1293,6 +1295,110 @@
   return false;
 }

+// If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
+// is a mask that clears a consecutive, byte-sized run of bits, check whether
+// Y is known to provide just those bytes. If so, the original sequence is
+// "load P into V; replace some bytes of V with the corresponding bytes of Y
+// via bit manipulation; store the updated V back to P". We can replace the
+// sequence with a single, narrower store and save the load and the bit
+// operations.
+static bool reduceLoadOpStoreWidth(InstCombiner &IC, StoreInst &SI) {
+  Value *Val = SI.getOperand(0);
+  Value *Ptr = SI.getOperand(1);
+  Type *StoreTy = Val->getType();
+  if (StoreTy->isVectorTy() || !StoreTy->isIntegerTy() || !Val->hasOneUse())
+    return false;
+
+  const DataLayout &DL = IC.getDataLayout();
+  unsigned TBits = DL.getTypeSizeInBits(StoreTy);
+  if (TBits != DL.getTypeStoreSizeInBits(StoreTy))
+    return false;
+
+  LoadInst *Load;
+  Value *MaskedVal;
+  ConstantInt *Cst;
+  // Match "or((and (load P), cst), Y)" or "or(Y, (and (load P), cst))".
+  if (!match(Val, m_c_Or(m_And(m_Load(Load), m_ConstantInt(Cst)),
+                         m_Value(MaskedVal))))
+    return false;
+
+  // The load must read from the same address the store writes to.
+  if (Load->getOperand(0) != Ptr)
+    return false;
+
+  // Cst is the mask. Analyze its bit pattern after sign-extending it to
+  // uint64_t. We handle masks of the form 0..01..1 or 1..10..01..1.
+  uint64_t Mask = Cst->getValue().getSExtValue();
+  unsigned MaskLeadOnes = countLeadingOnes(Mask);
+  unsigned MaskTrailOnes = countTrailingOnes(Mask);
+  unsigned MaskMidZeros = !MaskLeadOnes
+                              ? countLeadingZeros(Mask)
+                              : countTrailingZeros(Mask >> MaskTrailOnes);
+
+  // See if we have a single contiguous run of zeros.
+  if (MaskMidZeros == 0 || MaskLeadOnes + MaskMidZeros + MaskTrailOnes != 64)
+    return false;
+
+  // Adjust EffectiveZeros down so it is measured against the actual store
+  // size instead of i64.
+  unsigned EffectiveZeros = MaskMidZeros;
+  if (MaskTrailOnes + MaskMidZeros > TBits)
+    EffectiveZeros = MaskMidZeros - (64 - TBits);
+
+  // Only handle an EffectiveZeros size of 1, 2 or 4 bytes.
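+  // For example, test1 in the new bitfield-store.ll test stores an i16 with
+  // the mask 0xFF00, which sign-extends to 0xFFFFFFFFFFFFFF00: MaskLeadOnes
+  // is 56, MaskTrailOnes is 0 and MaskMidZeros is 8, so EffectiveZeros is 8
+  // and the store is shrunk to an i8 store of the low byte of MaskedVal.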
+  if (EffectiveZeros != 8 && EffectiveZeros != 16 && EffectiveZeros != 32)
+    return false;
+
+  if (MaskTrailOnes % EffectiveZeros)
+    return false;
+
+  // Check that MaskedVal only provides nonzero bits in the range from bit
+  // MaskTrailOnes (inclusive) to bit MaskTrailOnes + EffectiveZeros
+  // (exclusive).
+  APInt BitMask =
+      ~APInt::getBitsSet(TBits, MaskTrailOnes, MaskTrailOnes + EffectiveZeros);
+  if (!IC.MaskedValueIsZero(MaskedVal, BitMask))
+    return false;
+
+  // Start shrinking the store. Compute the byte offset from Ptr for the
+  // narrowed store.
+  unsigned StOffset;
+  if (DL.isBigEndian()) {
+    if ((TBits - MaskTrailOnes - EffectiveZeros) % 8 != 0)
+      return false;
+    StOffset = (TBits - MaskTrailOnes - EffectiveZeros) / 8;
+  } else {
+    if (MaskTrailOnes % 8 != 0)
+      return false;
+    StOffset = MaskTrailOnes / 8;
+  }
+
+  Value *NewPtr = Ptr;
+  unsigned Align = SI.getAlignment();
+  Type *NewTy = Type::getIntNTy(StoreTy->getContext(), EffectiveZeros);
+  InstCombiner::BuilderTy &Builder = *IC.Builder;
+  unsigned AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
+  // Generate the new address and alignment for the store:
+  if (StOffset) {
+    ConstantInt *Idx =
+        ConstantInt::get(Type::getInt32Ty(SI.getContext()), StOffset);
+    NewPtr =
+        Builder.CreateBitCast(Ptr, Type::getInt8PtrTy(SI.getContext(), AS));
+    NewPtr = Builder.CreateGEP(Type::getInt8Ty(SI.getContext()), NewPtr, Idx,
+                               "uglygep");
+    Align = DL.getABITypeAlignment(NewTy);
+  }
+  NewPtr = Builder.CreateBitCast(NewPtr, NewTy->getPointerTo(AS));
+
+  // Prepare the value to store: shift MaskedVal right by MaskTrailOnes bits
+  // and truncate it to the new type.
+  Value *LShr = Builder.CreateLShr(MaskedVal, MaskTrailOnes);
+  Value *Trunc = Builder.CreateTruncOrBitCast(LShr, NewTy);
+  // Create the new store and remove the old one.
+  Builder.CreateAlignedStore(Trunc, NewPtr, Align);
+  SI.eraseFromParent();
+  return true;
+}
+
 Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
   Value *Val = SI.getOperand(0);
   Value *Ptr = SI.getOperand(1);
@@ -1413,6 +1519,8 @@
   if (SimplifyStoreAtEndOfBlock(SI))
     return nullptr;  // xform done!

+  if (reduceLoadOpStoreWidth(*this, SI))
+    return nullptr;
   return nullptr;
 }
Index: test/Transforms/InstCombine/bitfield-store.ll
===================================================================
--- test/Transforms/InstCombine/bitfield-store.ll
+++ test/Transforms/InstCombine/bitfield-store.ll
@@ -0,0 +1,213 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; class A1 {
+;   unsigned long f1:8;
+;   unsigned long f2:3;
+; } a1;
+; a1.f1 = n;
+;
+; The bitfield store can be shrunk from i16 to i8.
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %[[TRUNC:.*]] = trunc i32 %n to i8
+; CHECK-NEXT: store i8 %[[TRUNC]], i8* bitcast (%class.A1* @a1 to i8*)
+; CHECK-NEXT: ret void
+
+%class.A1 = type { i16, [6 x i8] }
+@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8
+
+define void @test1(i32 %n) {
+entry:
+  %conv = zext i32 %n to i64
+  %0 = trunc i64 %conv to i16
+  %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8
+  %bf.value = and i16 %0, 255
+  %bf.clear = and i16 %bf.load, -256
+  %bf.set = or i16 %bf.clear, %bf.value
+  store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8
+  ret void
+}
+
+; class A2 {
+;   unsigned long f1:16;
+;   unsigned long f2:3;
+; } a2;
+; a2.f1 = n;
+;
+; The bitfield store can be shrunk from i32 to i16.
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %[[TRUNC:.*]] = trunc i32 %n to i16
+; CHECK-NEXT: store i16 %[[TRUNC]], i16* bitcast (%class.A2* @a2 to i16*)
+; CHECK-NEXT: ret void
+
+%class.A2 = type { i24, [4 x i8] }
+@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8
+
+define void @test2(i32 %n) {
+entry:
+  %conv = zext i32 %n to i64
+  %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8
+  %bf.value = and i32 %n, 65535
+  %bf.clear = and i32 %bf.load, -65536
+  %bf.set = or i32 %bf.clear, %bf.value
+  store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8
+  ret void
+}
+
+; class A3 {
+;   unsigned long f1:32;
+;   unsigned long f2:3;
+; } a3;
+; a3.f1 = n;
+;
+; The bitfield store can be shrunk from i64 to i32.
+; CHECK-LABEL: @test3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store i32 %n, i32* bitcast (%class.A3* @a3 to i32*)
+; CHECK-NEXT: ret void
+
+%class.A3 = type { i40 }
+@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8
+
+define void @test3(i32 %n) {
+entry:
+  %conv = zext i32 %n to i64
+  %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8
+  %bf.value = and i64 %conv, 4294967295
+  %bf.clear = and i64 %bf.load, -4294967296
+  %bf.set = or i64 %bf.clear, %bf.value
+  store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8
+  ret void
+}
+
+; class A4 {
+;   unsigned long f1:13;
+;   unsigned long f2:3;
+; } a4;
+; a4.f1 = n;
+;
+; The bitfield store cannot be shrunk because the field is not 8, 16 or 32
+; bits wide.
+; CHECK-LABEL: @test4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0)
+; CHECK-NEXT: %[[TRUNC:.*]] = trunc i32 %n to i16
+; CHECK-NEXT: %bf.value = and i16 %[[TRUNC]], 8191
+; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192
+; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value
+; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0)
+; CHECK-NEXT: ret void
+
+%class.A4 = type { i16, [6 x i8] }
+@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8
+
+define void @test4(i32 %n) {
+entry:
+  %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8
+  %t0 = trunc i32 %n to i16
+  %bf.value = and i16 %t0, 8191
+  %bf.clear3 = and i16 %bf.load, -8192
+  %bf.set = or i16 %bf.clear3, %bf.value
+  store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8
+  ret void
+}
+
+; class A5 {
+;   unsigned long f1:3;
+;   unsigned long f2:16;
+; } a5;
+; a5.f2 = n;
+;
+; The bitfield store cannot be shrunk because the field does not start on a
+; 16-bit boundary.
+; CHECK-LABEL: @test5(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*)
+; CHECK-NEXT: %bf.value = shl i32 %n, 3
+; CHECK-NEXT: and i32 %bf.value, 524280
+; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281
+; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl
+; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*)
+; CHECK-NEXT: ret void
+
+%class.A5 = type { i24, [4 x i8] }
+@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8
+
+define void @test5(i32 %n) {
+entry:
+  %conv = zext i32 %n to i64
+  %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8
+  %bf.value = and i32 %n, 65535
+  %bf.shl = shl i32 %bf.value, 3
+  %bf.clear = and i32 %bf.load, -524281
+  %bf.set = or i32 %bf.clear, %bf.shl
+  store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8
+  ret void
+}
+
+; class A6 {
+;   unsigned long f1:16;
+;   unsigned long f2:3;
+; } a6;
+; a6.f1 = n;
+;
+; The bitfield store can be shrunk from i32 to i16 even when the load and the
+; store are in different basic blocks.
+; CHECK-LABEL: @test6(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*)
+; CHECK-NEXT: %bf.clear = and i32 %bf.load, 65535
+; CHECK-NEXT: %cmp = icmp eq i32 %bf.clear, 2
+; CHECK-NEXT: br i1 %cmp, label %return, label %if.end
+; CHECK: if.end:
+; CHECK-NEXT: %0 = trunc i32 %n to i16
+; CHECK-NEXT: store i16 %0, i16* bitcast (%class.A6* @a6 to i16*)
+; CHECK-NEXT: br label %return
+
+%class.A6 = type { i24, [4 x i8] }
+@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8
+
+define void @test6(i32 %n) {
+entry:
+  %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8
+  %bf.clear = and i32 %bf.load, 65535
+  %bf.cast = zext i32 %bf.clear to i64
+  %cmp = icmp eq i32 %bf.clear, 2
+  br i1 %cmp, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  %conv1 = zext i32 %n to i64
+  %bf.value = and i32 %n, 65535
+  %bf.clear3 = and i32 %bf.load, -65536
+  %bf.set = or i32 %bf.clear3, %bf.value
+  store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8
+  br label %return
+
+return:                                           ; preds = %entry, %if.end
+  ret void
+}
+
+; class A7 {
+;   unsigned long f1:16;
+;   unsigned long f2:16;
+; } a7;
+; a7.f2 = n;
+;
+; The bitfield store can be shrunk from i32 to i16.
+; CHECK-LABEL: @test7(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %[[TRUNC:.*]] = trunc i32 %n to i16
+; CHECK-NEXT: store i16 %[[TRUNC]], i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i64 2) to i16*)
+; CHECK-NEXT: ret void
+
+%class.A7 = type { i32, [4 x i8] }
+@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8
+
+define void @test7(i32 %n) {
+entry:
+  %conv = zext i32 %n to i64
+  %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8
+  %bf.value = and i32 %n, 65535
+  %bf.shl = shl i32 %bf.value, 16
+  %bf.clear = and i32 %bf.load, 65535
+  %bf.set = or i32 %bf.clear, %bf.shl
+  store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8
+  ret void
+}
+
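For reference, the rewrite this patch performs on test1 above (little-endian, i16 store, mask 0xFF00) is sketched below. This is only an illustration distilled from the test's CHECK lines, not part of the patch or additional test coverage; the value name and the alignment in the "after" half are illustrative.

  ; Before: load the whole i16, merge in the low byte, store the whole i16.
  %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8
  %bf.value = and i16 %0, 255
  %bf.clear = and i16 %bf.load, -256
  %bf.set = or i16 %bf.clear, %bf.value
  store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8

  ; After: a single narrowed i8 store; the load and the bit operations are gone.
  %t = trunc i32 %n to i8
  store i8 %t, i8* bitcast (%class.A1* @a1 to i8*), align 8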