Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -329,6 +329,10 @@ return false; } + /// \brief Return true if it is cheaper to split the store of a merged int val + /// from a pair of smaller values into multiple stores. + virtual bool isMultiStoresCheaperThanBitsMerge() const { return false; } + /// \brief Return if the target supports combining a /// chain like: /// \code Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -374,6 +374,7 @@ SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); SDValue ReduceLoadWidth(SDNode *N); SDValue ReduceLoadOpStoreWidth(SDNode *N); + SDValue splitMergedValStore(StoreSDNode *ST); SDValue TransformFPLoadStorePair(SDNode *N); SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N); @@ -12155,9 +12156,122 @@ return NewSt; } + if (TLI.isMultiStoresCheaperThanBitsMerge()) { + if (SDValue NewSt = splitMergedValStore(ST)) + return NewSt; + } + return ReduceLoadOpStoreWidth(N); } +/// For the instruction sequence of store below, i32_tmp and float_tmp +/// are bundled together as an i64 data before stored into memory. If the +/// i64 data is not used outside of the store, it is more efficient to +/// generate separate stores for i32_tmp and float_tmp. 
+/// +/// Instruction sequence of i64 Store: +/// t1: i32 = bitcast float_tmp +/// t2: i64 = zero_extend t1 +/// t3: i64 = zero_extend i32_tmp +/// t4: i64 = shl t3, Constant:i8<32> +/// t5: i64 = or i64 t2, t4 +/// t6: ch = store t0, t5, FrameIndex:i64<0>, undef:i64 +/// +/// Instruction sequence of split i32 stores: +/// t1: i32 = bitcast float_tmp +/// t2: ch = store t0, t1, FrameIndex:i64<0>, undef:i64 +/// t3: i64 = add FrameIndex:i64<0>, Constant:i64<4> +/// t4: ch = store t0, i32_tmp, t3, undef:i64 +/// +/// Similarly, +/// {i32, i32} pair store can be converted from i64 store to two i32 stores. +/// {i32, i16} pair store can be converted from i64 store to two i32 stores. +/// {i16, i16} pair store can be converted from i32 store to two i16 stores. +/// {i16, i8} pair store can be converted from i32 store to two i16 stores. +/// {i8, i8} pair store can be converted from i16 store to two i8 stores. +/// +/// The store patterns are commonly seen from the simple code snippet below +/// if only std::make_pair(...) is sroa transformed before inlined into hoo. +/// void goo(const std::pair<int, float> &); +/// hoo() { +/// ... +/// goo(std::make_pair(tmp, ftmp)); +/// ... +/// } +/// +SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { + if (OptLevel == CodeGenOpt::None) + return SDValue(); + + SDValue Val = ST->getValue(); + SDLoc DL(ST); + + // Match OR operand. + if (!Val.getValueType().isScalarInteger() || !Val.getNode()->hasOneUse() || + Val.getOpcode() != ISD::OR) + return SDValue(); + + // Match SHL operand and get Lower and Higher parts of Val. + SDValue Op1 = Val.getOperand(0); + SDValue Op2 = Val.getOperand(1); + SDValue Shl, Lo, Hi; + if (Op1.getOpcode() == ISD::SHL) { + Shl = Op1; + Lo = Op2; + Hi = Shl.getOperand(0); + } else if (Op2.getOpcode() == ISD::SHL) { + Shl = Op2; + Lo = Op1; + Hi = Shl.getOperand(0); + } else { + return SDValue(); + } + + if (!Shl.hasOneUse()) + return SDValue(); + + // Match SHL amount to HalfValBitSize. 
+ unsigned HalfValBitSize = Val.getValueType().getSizeInBits() / 2; + ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Shl.getOperand(1)); + if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize) + return SDValue(); + + // Lo and Hi are zero-extended from int with size less than or equal to 32 + // to i64. + if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() || + !Lo.getOperand(0).getValueType().isScalarInteger() || + Lo.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize || + Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() || + !Hi.getOperand(0).getValueType().isScalarInteger() || + Hi.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize) + return SDValue(); + + // Start to split store. + unsigned Alignment = ST->getAlignment(); + MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); + AAMDNodes AAInfo = ST->getAAInfo(); + + // Change the sizes of Lo and Hi's value types to HalfValBitSize. + EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize); + Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0)); + Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0)); + + SDValue Chain = ST->getChain(); + SDValue Ptr = ST->getBasePtr(); + // Lower value store. + SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), + ST->getAlignment(), MMOFlags, AAInfo); + Ptr = + DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(HalfValBitSize / 8, DL, Ptr.getValueType())); + // Higher value store. 
+ SDValue St1 = + DAG.getStore(Chain, DL, Hi, Ptr, + ST->getPointerInfo().getWithOffset(HalfValBitSize / 8), + Alignment / 2, MMOFlags, AAInfo); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1); +} + SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { SDValue InVec = N->getOperand(0); SDValue InVal = N->getOperand(1); Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -764,6 +764,8 @@ return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); } + bool isMultiStoresCheaperThanBitsMerge() const override { return true; } + bool hasAndNotCompare(SDValue Y) const override; /// Return the value type to use for ISD::SETCC. Index: test/CodeGen/X86/split-store.ll =================================================================== --- test/CodeGen/X86/split-store.ll +++ test/CodeGen/X86/split-store.ll @@ -0,0 +1,193 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s + +declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.end(i64, i8* nocapture) + +declare void @goo1(%"pair1"* dereferenceable(8)) local_unnamed_addr +%"pair1" = type { i32, float } + +; CHECK-LABEL: int32_float_pair +; CHECK: movss %xmm0, 4(%rsp) +; CHECK: movl %edi, (%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int32_float_pair(i32 %tmp1, float %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %ref.tmp to %"pair1"* + %t0 = bitcast i64* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %t1 = bitcast float %tmp2 to i32 + %retval.sroa.2.0.insert.ext.i = zext i32 %t1 to i64 + %retval.sroa.2.0.insert.shift.i = shl nuw i64 %retval.sroa.2.0.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i64 
%retval.sroa.0.0.insert.insert.i, i64* %ref.tmp, align 8 + call void @goo1(%"pair1"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo2(%"pair2"* dereferenceable(8)) local_unnamed_addr +%"pair2" = type { float, i32 } + +; CHECK-LABEL: float_int32_pair +; CHECK: movl %edi, 4(%rsp) +; CHECK: movss %xmm0, (%rsp) +; CHECK: leaq (%rsp), %rdi +define void @float_int32_pair(float %tmp1, i32 %tmp2) local_unnamed_addr #0 { +entry: + %ref.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %ref.tmp to %"pair2"* + %t0 = bitcast i64* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) #5 + %t1 = bitcast float %tmp1 to i32 + %retval.sroa.2.0.insert.ext.i = zext i32 %tmp2 to i64 + %retval.sroa.2.0.insert.shift.i = shl nuw i64 %retval.sroa.2.0.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %t1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i64 %retval.sroa.0.0.insert.insert.i, i64* %ref.tmp, align 8 + call void @goo2(%"pair2"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) #5 + ret void +} + +declare void @goo3(%"pair3"* dereferenceable(8)) local_unnamed_addr +%"pair3" = type { i32, i32 } + +; CHECK-LABEL: int32_int32_pair +; CHECK: movl %esi, 4(%rsp) +; CHECK: movl %edi, (%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int32_int32_pair(i32 %tmp1, i32 %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %ref.tmp to %"pair3"* + %t0 = bitcast i64* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i32 %tmp2 to i64 + %retval.sroa.2.0.insert.shift.i = shl nuw i64 %retval.sroa.2.0.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i64 %retval.sroa.0.0.insert.insert.i, 
i64* %ref.tmp, align 8 + call void @goo3(%"pair3"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo4(%"pair4"* dereferenceable(8)) local_unnamed_addr +%"pair4" = type { i32, i16 } + +; CHECK-LABEL: int32_int16_pair +; CHECK: movl %edi, (%rsp) +; CHECK: movzwl %si, %eax +; CHECK: movl %eax, 4(%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int32_int16_pair(i32 %tmp1, i16 signext %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %ref.tmp to %"pair4"* + %t0 = bitcast i64* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i16 %tmp2 to i64 + %retval.sroa.2.0.insert.shift.i = shl nuw nsw i64 %retval.sroa.2.0.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i64 %retval.sroa.0.0.insert.insert.i, i64* %ref.tmp, align 8 + call void @goo4(%"pair4"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo5(%"pair5"* dereferenceable(8)) local_unnamed_addr +%"pair5" = type { i32, i8 } + +; CHECK-LABEL: int32_int8_pair +; CHECK: movl %edi, (%rsp) +; CHECK: movzbl %sil, %eax +; CHECK: movl %eax, 4(%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int32_int8_pair(i32 %tmp1, i8 signext %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %ref.tmp to %"pair5"* + %t0 = bitcast i64* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i8 %tmp2 to i64 + %retval.sroa.2.0.insert.shift.i = shl nuw nsw i64 %retval.sroa.2.0.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %tmp1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i64 %retval.sroa.0.0.insert.insert.i, i64* 
%ref.tmp, align 8 + call void @goo5(%"pair5"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo6(%"pair6"* dereferenceable(8)) local_unnamed_addr +%"pair6" = type { i16, i16 } + +; CHECK-LABEL: int16_int16_pair +; CHECK: movw %si, 2(%rsp) +; CHECK: movw %di, (%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int16_int16_pair(i16 %tmp1, i16 signext %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i32, align 8 + %tmpcast = bitcast i32* %ref.tmp to %"pair6"* + %t0 = bitcast i32* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i16 %tmp2 to i32 + %retval.sroa.2.0.insert.shift.i = shl nuw nsw i32 %retval.sroa.2.0.insert.ext.i, 16 + %retval.sroa.0.0.insert.ext.i = zext i16 %tmp1 to i32 + %retval.sroa.0.0.insert.insert.i = or i32 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i32 %retval.sroa.0.0.insert.insert.i, i32* %ref.tmp, align 8 + call void @goo6(%"pair6"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo7(%"pair7"* dereferenceable(8)) local_unnamed_addr +%"pair7" = type { i16, i8 } + +; CHECK-LABEL: int16_int8_pair +; CHECK: movw %di, (%rsp) +; CHECK: movzbl %sil, %eax +; CHECK: movw %ax, 2(%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int16_int8_pair(i16 %tmp1, i8 signext %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i32, align 8 + %tmpcast = bitcast i32* %ref.tmp to %"pair7"* + %t0 = bitcast i32* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i8 %tmp2 to i32 + %retval.sroa.2.0.insert.shift.i = shl nuw nsw i32 %retval.sroa.2.0.insert.ext.i, 16 + %retval.sroa.0.0.insert.ext.i = zext i16 %tmp1 to i32 + %retval.sroa.0.0.insert.insert.i = or i32 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i32 %retval.sroa.0.0.insert.insert.i, i32* %ref.tmp, align 8 + call void 
@goo7(%"pair7"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +} + +declare void @goo8(%"pair8"* dereferenceable(8)) local_unnamed_addr +%"pair8" = type { i8, i8 } + +; CHECK-LABEL: int8_int8_pair +; CHECK: movb %sil, 1(%rsp) +; CHECK: movb %dil, (%rsp) +; CHECK: leaq (%rsp), %rdi +define void @int8_int8_pair(i8 %tmp1, i8 signext %tmp2) local_unnamed_addr { +entry: + %ref.tmp = alloca i16, align 8 + %tmpcast = bitcast i16* %ref.tmp to %"pair8"* + %t0 = bitcast i16* %ref.tmp to i8* + call void @llvm.lifetime.start(i64 8, i8* %t0) + %retval.sroa.2.0.insert.ext.i = zext i8 %tmp2 to i16 + %retval.sroa.2.0.insert.shift.i = shl nuw nsw i16 %retval.sroa.2.0.insert.ext.i, 8 + %retval.sroa.0.0.insert.ext.i = zext i8 %tmp1 to i16 + %retval.sroa.0.0.insert.insert.i = or i16 %retval.sroa.2.0.insert.shift.i, %retval.sroa.0.0.insert.ext.i + store i16 %retval.sroa.0.0.insert.insert.i, i16* %ref.tmp, align 8 + call void @goo8(%"pair8"* dereferenceable(8) %tmpcast) + call void @llvm.lifetime.end(i64 8, i8* %t0) + ret void +}