Index: llvm/trunk/include/llvm/Target/TargetLowering.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h
+++ llvm/trunk/include/llvm/Target/TargetLowering.h
@@ -335,6 +335,12 @@
     return false;
   }
 
+  /// \brief Return true if it is cheaper to split the store of a merged
+  /// integer value from a pair of smaller values into multiple stores.
+  virtual bool isMultiStoresCheaperThanBitsMerge(SDValue Lo, SDValue Hi) const {
+    return false;
+  }
+
   /// \brief Return if the target supports combining a
   /// chain like:
   /// \code
Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -374,6 +374,7 @@
     SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
+    SDValue splitMergedValStore(StoreSDNode *ST);
    SDValue TransformFPLoadStorePair(SDNode *N);
     SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
     SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
@@ -12200,9 +12201,111 @@
     return NewSt;
   }
 
+  if (SDValue NewSt = splitMergedValStore(ST))
+    return NewSt;
+
   return ReduceLoadOpStoreWidth(N);
 }
 
+/// In the store instruction sequence below, the values F and I are bundled
+/// together as an i64 value before being stored into memory. Sometimes it
+/// is more efficient to generate separate stores for F and I, which can
+/// remove the bitwise instructions or sink them to colder places.
+///
+/// (store (or (zext (bitcast F to i32) to i64),
+///            (shl (zext I to i64), 32)), addr)  -->
+/// (store F, addr) and (store I, addr+4)
+///
+/// Similarly, splitting other merged stores can also be beneficial:
+/// For pair of {i32, i32}, i64 store --> two i32 stores.
+/// For pair of {i32, i16}, i64 store --> two i32 stores.
+/// For pair of {i16, i16}, i32 store --> two i16 stores.
+/// For pair of {i16, i8},  i32 store --> two i16 stores.
+/// For pair of {i8, i8},   i16 store --> two i8 stores.
+///
+/// We allow each target to determine specifically which kinds of splitting
+/// are supported.
+///
+/// The store patterns are commonly seen from the simple code snippet below
+/// if only std::make_pair(...) is SROA-transformed before being inlined
+/// into hoo:
+/// void goo(const std::pair<int, float> &);
+/// hoo() {
+///   ...
+///   goo(std::make_pair(tmp, ftmp));
+///   ...
+/// }
+///
+SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
+  if (OptLevel == CodeGenOpt::None)
+    return SDValue();
+
+  SDValue Val = ST->getValue();
+  SDLoc DL(ST);
+
+  // Match OR operand.
+  if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
+    return SDValue();
+
+  // Match SHL operand and get the lower and higher parts of Val.
+  SDValue Op1 = Val.getOperand(0);
+  SDValue Op2 = Val.getOperand(1);
+  SDValue Lo, Hi;
+  if (Op1.getOpcode() != ISD::SHL) {
+    std::swap(Op1, Op2);
+    if (Op1.getOpcode() != ISD::SHL)
+      return SDValue();
+  }
+  Lo = Op2;
+  Hi = Op1.getOperand(0);
+  if (!Op1.hasOneUse())
+    return SDValue();
+
+  // Match the shift amount to HalfValBitSize.
+  unsigned HalfValBitSize = Val.getValueType().getSizeInBits() / 2;
+  ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
+  if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
+    return SDValue();
+
+  // Lo and Hi must be zero-extended from scalar integers whose width is at
+  // most HalfValBitSize (e.g. from an int of size <= 32 to i64).
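+  // The one-use requirements below make sure the zext/shl/or chain actually
+  // dies once the store is split; if the extensions or the shift had other
+  // users, the bitwise instructions would stay live and splitting would only
+  // add a store.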
+  if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
+      !Lo.getOperand(0).getValueType().isScalarInteger() ||
+      Lo.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize ||
+      Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
+      !Hi.getOperand(0).getValueType().isScalarInteger() ||
+      Hi.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize)
+    return SDValue();
+
+  if (!TLI.isMultiStoresCheaperThanBitsMerge(Lo.getOperand(0),
+                                             Hi.getOperand(0)))
+    return SDValue();
+
+  // Start splitting the store.
+  unsigned Alignment = ST->getAlignment();
+  MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+  AAMDNodes AAInfo = ST->getAAInfo();
+
+  // Change the sizes of Lo and Hi's value types to HalfValBitSize.
+  EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
+  Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));
+
+  SDValue Chain = ST->getChain();
+  SDValue Ptr = ST->getBasePtr();
+  // Lower value store.
+  SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
+                             Alignment, MMOFlags, AAInfo);
+  Ptr =
+      DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                  DAG.getConstant(HalfValBitSize / 8, DL, Ptr.getValueType()));
+  // Higher value store.
+  SDValue St1 =
+      DAG.getStore(Chain, DL, Hi, Ptr,
+                   ST->getPointerInfo().getWithOffset(HalfValBitSize / 8),
+                   Alignment / 2, MMOFlags, AAInfo);
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1);
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -764,6 +764,26 @@
       return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
     }
 
+    bool isMultiStoresCheaperThanBitsMerge(SDValue Lo,
+                                           SDValue Hi) const override {
+      // If the pair to store is a mixture of float and int values, we will
+      // save two bitwise instructions and one float-to-int instruction and
+      // add one store instruction. There is potentially a more significant
+      // benefit because it avoids the float->int domain switch for the
+      // input value, so it is more likely a win.
+      if (Lo.getOpcode() == ISD::BITCAST || Hi.getOpcode() == ISD::BITCAST) {
+        SDValue Opd = (Lo.getOpcode() == ISD::BITCAST) ? Lo.getOperand(0)
+                                                       : Hi.getOperand(0);
+        if (Opd.getValueType().isFloatingPoint())
+          return true;
+      }
+      // If the pair only contains int values, we will save two bitwise
+      // instructions but add one store instruction (costing one more store
+      // buffer entry). Since that benefit is less clear, we leave such
+      // pairs out until a test case proves the split is a win.
+      return false;
+    }
+
     bool hasAndNotCompare(SDValue Y) const override;
 
     /// Return the value type to use for ISD::SETCC.
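The correctness of the split rests on little-endian byte layout: writing
(zext lo) | ((zext hi) << HalfValBitSize) as one wide store produces the same
bytes as storing lo at addr and hi at addr + HalfValBitSize/8. A minimal
self-contained sketch of that equivalence for the i64 case (illustrative only,
not part of the patch):

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    uint32_t lo = 0x11223344, hi = 0x55667788;
    unsigned char merged[8], split[8];

    // One i64 store of the merged value.
    uint64_t wide = (uint64_t)lo | ((uint64_t)hi << 32);
    std::memcpy(merged, &wide, 8);

    // Two i32 stores: lo at addr, hi at addr+4.
    std::memcpy(split, &lo, 4);
    std::memcpy(split + 4, &hi, 4);

    // Identical bytes on a little-endian host.
    assert(std::memcmp(merged, split, 8) == 0);
    return 0;
  }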
Index: llvm/trunk/test/CodeGen/X86/split-store.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/split-store.ll
+++ llvm/trunk/test/CodeGen/X86/split-store.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: int32_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movl %edi, (%rsi)
+define void @int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: float_int32_pair
+; CHECK: movl %edi, 4(%rsi)
+; CHECK: movss %xmm0, (%rsi)
+define void @float_int32_pair(float %tmp1, i32 %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp1 to i32
+  %t1 = zext i32 %tmp2 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %t0 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int16_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movzwl %di, %eax
+; CHECK: movl %eax, (%rsi)
+define void @int16_float_pair(i16 signext %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i16 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int8_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movzbl %dil, %eax
+; CHECK: movl %eax, (%rsi)
define void @int8_float_pair(i8 signext %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i8 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
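For context, a compilable version of the code snippet referenced in the
splitMergedValStore comment; goo, hoo, tmp, and ftmp come from that comment,
while the goo body and main are stand-ins added so the example links and runs
(in the motivating case goo is only declared, so the merged i64 store reaches
the backend):

  #include <cstdio>
  #include <utility>

  // Stand-in body; the patch's comment only declares goo.
  void goo(const std::pair<int, float> &p) {
    std::printf("%d %f\n", p.first, p.second);
  }

  void hoo(int tmp, float ftmp) {
    // Per the splitMergedValStore comment, once std::make_pair is
    // SROA-transformed and inlined, the pair temporary is initialized by a
    // single i64 store merging the int and the bitcast float; the new
    // combine splits it back into two 32-bit stores.
    goo(std::make_pair(tmp, ftmp));
  }

  int main() { hoo(42, 1.5f); }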