Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -5179,6 +5179,127 @@
   return false;
 }
 
+/// For the instruction sequence of store below, F and I values
+/// are bundled together as an i64 value before being stored into memory.
+/// Sometimes it is more efficient to generate separate stores for F and I,
+/// which can remove the bitwise instructions or sink them to colder places.
+///
+///   (store (or (zext (bitcast F to i32) to i64),
+///              (shl (zext I to i64), 32)), addr)  -->
+///   (store F, addr) and (store I, addr+4)
+///
+/// Similarly, splitting other merged stores can also be beneficial, like:
+/// For pair of {i32, i32}, i64 store --> two i32 stores.
+/// For pair of {i32, i16}, i64 store --> two i32 stores.
+/// For pair of {i16, i16}, i32 store --> two i16 stores.
+/// For pair of {i16, i8},  i32 store --> two i16 stores.
+/// For pair of {i8, i8},   i16 store --> two i8 stores.
+///
+/// We allow each target to determine specifically which kind of splitting is
+/// supported.
+///
+/// The store patterns are commonly seen from the simple code snippet below
+/// if only std::make_pair(...) is SROA-transformed before being inlined into hoo:
+///   void goo(const std::pair<int, float> &);
+///   hoo() {
+///     ...
+///     goo(std::make_pair(tmp, ftmp));
+///     ...
+///   }
+///
+/// Although we already have similar splitting in DAG Combine, we duplicate
+/// it in CodeGenPrepare to catch the case in which the pattern spans
+/// multiple BBs. The logic in DAG Combine is kept to catch cases generated
+/// during code expansion.
+static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
+                                const TargetLowering &TLI) {
+  // Match the OR operand.
+  BinaryOperator *OR = dyn_cast<BinaryOperator>(SI.getValueOperand());
+  if (!OR || OR->getOpcode() != Instruction::Or ||
+      OR->getParent() != SI.getParent())
+    return false;
+
+  // Match the SHL operand and get the lower and higher parts of the value.
+  Value *Op1 = OR->getOperand(0);
+  Value *Op2 = OR->getOperand(1);
+  BinaryOperator *SHL = dyn_cast<BinaryOperator>(Op1);
+  if (!SHL || SHL->getOpcode() != Instruction::Shl) {
+    std::swap(Op1, Op2);
+    SHL = dyn_cast<BinaryOperator>(Op1);
+    if (!SHL || SHL->getOpcode() != Instruction::Shl)
+      return false;
+  }
+  if (!SHL->hasOneUse())
+    return false;
+
+  // Match the shift amount to HalfValBitSize.
+  unsigned HalfValBitSize =
+      DL.getTypeSizeInBits(SI.getValueOperand()->getType()) / 2;
+  ConstantInt *CI = dyn_cast<ConstantInt>(SHL->getOperand(1));
+  if (!CI || CI->getValue() != HalfValBitSize)
+    return false;
+
+  // Check that ZL and ZH are zero-extended from an integer type no wider
+  // than HalfValBitSize.
+  ZExtInst *ZL = dyn_cast<ZExtInst>(Op2);
+  ZExtInst *ZH = dyn_cast<ZExtInst>(SHL->getOperand(0));
+  if (!ZL || !ZL->hasOneUse() || !ZH || !ZH->hasOneUse())
+    return false;
+  Value *LValue = ZL->getOperand(0);
+  Value *HValue = ZH->getOperand(0);
+  if (!LValue->getType()->isIntegerTy() ||
+      DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
+      !HValue->getType()->isIntegerTy() ||
+      DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
+    return false;
+
+  // If LValue/HValue is a bitcast instruction, use the EVT before the bitcast
+  // as the input of the target query.
+  EVT LowTy = EVT::getEVT(LValue->getType());
+  EVT HighTy = EVT::getEVT(HValue->getType());
+  BasicBlock *CurBB = OR->getParent();
+  if (BitCastInst *BC = dyn_cast<BitCastInst>(LValue))
+    LowTy = EVT::getEVT(BC->getOperand(0)->getType(), true);
+  if (BitCastInst *BC = dyn_cast<BitCastInst>(HValue))
+    HighTy = EVT::getEVT(BC->getOperand(0)->getType(), true);
+
+  if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
+    return false;
+
+  // Start to split the store.
+  IRBuilder<> Builder(SI.getContext());
+
+  // If LValue/HValue is a bitcast in another BB and has only one use, move
+  // it to the current BB so it may be merged with the split stores by the
+  // DAG combiner.
+  BitCastInst *LBC = dyn_cast<BitCastInst>(LValue);
+  if (LBC && LBC->hasOneUse() && LBC->getParent() != CurBB) {
+    Builder.SetInsertPoint(ZL);
+    LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
+  }
+  BitCastInst *HBC = dyn_cast<BitCastInst>(HValue);
+  if (HBC && HBC->hasOneUse() && HBC->getParent() != CurBB) {
+    Builder.SetInsertPoint(ZH);
+    HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());
+  }
+
+  Builder.SetInsertPoint(&SI);
+  Type *Ty = Type::getIntNTy(SI.getContext(), HalfValBitSize);
+  Type *PtrTy = Ty->getPointerTo(SI.getPointerAddressSpace());
+  Value *Low = Builder.CreateZExtOrBitCast(LValue, Ty);
+  Value *LowAddr = Builder.CreateBitCast(SI.getOperand(1), PtrTy);
+  Builder.CreateAlignedStore(Low, LowAddr, SI.getAlignment());
+
+  Value *High = Builder.CreateZExtOrBitCast(HValue, Ty);
+  Value *HighAddr = Builder.CreateGEP(
+      Ty, LowAddr, ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
+  Builder.CreateAlignedStore(High, HighAddr, SI.getAlignment() / 2);
+
+  // Delete the old store; the bitwise instructions that fed it are now dead.
+  SI.eraseFromParent();
+  return true;
+}
+
 bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
@@ -5243,6 +5364,8 @@
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    if (TLI && splitMergedValStore(*SI, *DL, *TLI))
+      return true;
     stripInvariantGroupMetadata(*SI);
     if (TLI) {
       unsigned AS = SI->getPointerAddressSpace();
Index: test/CodeGen/X86/split-store.ll
===================================================================
--- test/CodeGen/X86/split-store.ll
+++ test/CodeGen/X86/split-store.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
 
 ; CHECK-LABEL: int32_float_pair
-; CHECK: movss %xmm0, 4(%rsi)
 ; CHECK: movl %edi, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
 define void @int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp2 to i32
@@ -15,8 +15,8 @@
 }
 
 ; CHECK-LABEL: float_int32_pair
-; CHECK: movl %edi, 4(%rsi)
 ; CHECK: movss %xmm0, (%rsi)
+; CHECK: movl %edi, 4(%rsi)
 define void @float_int32_pair(float %tmp1, i32 %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp1 to i32
@@ -29,9 +29,9 @@
 }
 
 ; CHECK-LABEL: int16_float_pair
-; CHECK: movss %xmm0, 4(%rsi)
 ; CHECK: movzwl %di, %eax
 ; CHECK: movl %eax, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
 define void @int16_float_pair(i16 signext %tmp1, float %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp2 to i32
@@ -44,9 +44,9 @@
 }
 
 ; CHECK-LABEL: int8_float_pair
-; CHECK: movss %xmm0, 4(%rsi)
 ; CHECK: movzbl %dil, %eax
 ; CHECK: movl %eax, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
 define void @int8_float_pair(i8 signext %tmp1, float %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp2 to i32
@@ -57,3 +57,19 @@
   store i64 %t4, i64* %ref.tmp, align 8
   ret void
 }
+
+; CHECK-LABEL: mbb_int32_float_pair
+; CHECK: movl %edi, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
+define void @mbb_int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  br label %next
+next:
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
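
For reviewers, a minimal sketch (not part of the patch itself) of the IR that splitMergedValStore produces for the new mbb_int32_float_pair test, assuming the target's isMultiStoresCheaperThanBitsMerge hook reports the {i32, float} pair as profitable; the value names %f, %lo.addr, and %hi.addr are illustrative only:

next:
  ; the cross-BB bitcast is re-created in the store's BB so the DAG combiner
  ; can later fold it into the high-half store
  %f = bitcast float %tmp2 to i32
  %lo.addr = bitcast i64* %ref.tmp to i32*
  ; low half keeps the original store's alignment (8)
  store i32 %tmp1, i32* %lo.addr, align 8
  %hi.addr = getelementptr i32, i32* %lo.addr, i32 1
  ; high half sits HalfValBitSize further along, at half the alignment (4)
  store i32 %f, i32* %hi.addr, align 4
  ret void

The now-dead or/shl/zext chain is left behind for later cleanup, and the re-created bitcast is expected to fold into the movss checked by the CHECK lines above.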