Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -124,6 +124,10 @@
     "profile-guided-section-prefix", cl::Hidden, cl::init(true),
     cl::desc("Use profile info to add section prefix for hot/cold functions"));
 
+static cl::opt<bool> ForceSplitStore( // When set, the TLI query is skipped.
+    "force-split-store", cl::Hidden, cl::init(false),
+    cl::desc("Force store splitting no matter what the target query says."));
+
 namespace {
 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
 typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
@@ -5263,6 +5267,115 @@
   return false;
 }
 
+/// For the instruction sequence of store below, F and I values
+/// are bundled together as an i64 value before being stored into memory.
+/// Sometimes it is more efficient to generate separate stores for F and I,
+/// which can remove the bitwise instructions or sink them to colder places.
+///
+///   (store (or (zext (bitcast F to i32) to i64),
+///              (shl (zext I to i64), 32)), addr)  -->
+///   (store F, addr) and (store I, addr+4)    ; swapped on big-endian
+///
+/// Other merged stores are split the same way: {i32, i32} or {i32, i16} in
+/// an i64 store --> two i32 stores; {i16, i16} or {i16, i8} in an i32 store
+/// --> two i16 stores; {i8, i8} in an i16 store --> two i8 stores. Each
+/// target decides which kinds of splitting it supports.
+///
+/// The pattern is commonly seen for goo(std::make_pair(tmp, ftmp)) when only
+/// std::make_pair(...) is sroa transformed before being inlined. DAG Combine
+/// has similar splitting; it is duplicated here to catch patterns that span
+/// multiple BBs, and kept there to catch cases generated during expansion.
+///
+/// \returns true if \p SI was replaced by two half-width stores and erased.
+static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
+                                const TargetLowering &TLI) {
+  // Splitting a volatile or atomic store would change its semantics.
+  if (!SI.isSimple())
+    return false;
+
+  // Handle simple but common cases only. Validate the stored type before
+  // forming the half-width type, so an i1 store never requests an i0 type.
+  Type *StoreType = SI.getValueOperand()->getType();
+  if (!StoreType->isIntegerTy() ||
+      DL.getTypeStoreSizeInBits(StoreType) != DL.getTypeSizeInBits(StoreType))
+    return false;
+  unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
+  Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
+  if (DL.getTypeStoreSizeInBits(SplitStoreType) !=
+      DL.getTypeSizeInBits(SplitStoreType))
+    return false;
+
+  // Match, with the operands of OR in either order:
+  //   (store (or (zext LValue to i64),
+  //              (shl (zext HValue to i64), 32)), addr)
+  // Expect both operands of OR and the first operand of SHL have only
+  // one use.
+  Value *LValue, *HValue;
+  if (!match(SI.getValueOperand(),
+             m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
+                    m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
+                                   m_SpecificInt(HalfValBitSize))))))
+    return false;
+
+  // Check LValue and HValue are integers no wider than half the stored value.
+  if (!LValue->getType()->isIntegerTy() ||
+      DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
+      !HValue->getType()->isIntegerTy() ||
+      DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
+    return false;
+
+  // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
+  // as the input of target query.
+  auto *LBC = dyn_cast<BitCastInst>(LValue);
+  auto *HBC = dyn_cast<BitCastInst>(HValue);
+  EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
+                  : EVT::getEVT(LValue->getType());
+  EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
+                   : EVT::getEVT(HValue->getType());
+  if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
+    return false;
+
+  // Start to split store.
+  IRBuilder<> Builder(&SI);
+
+  // If LValue/HValue is a bitcast in another BB, create a new one in current
+  // BB so it may be merged with the split stores by dag combiner.
+  if (LBC && LBC->getParent() != SI.getParent())
+    LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
+  if (HBC && HBC->getParent() != SI.getParent())
+    HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());
+
+  unsigned Alignment = SI.getAlignment();
+  if (!Alignment) // 0 means the ABI alignment of the stored type.
+    Alignment = DL.getABITypeAlignment(StoreType);
+
+  auto CreateSplitStore = [&](Value *V, bool Upper) {
+    V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
+    Value *Addr = Builder.CreateBitCast(
+        SI.getPointerOperand(),
+        SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
+    unsigned HalfAlignment = Alignment;
+    // The upper half is the one at the higher address on little-endian
+    // targets; on big-endian targets it is the lower half.
+    if (Upper != DL.isBigEndian()) {
+      Addr = Builder.CreateGEP(
+          SplitStoreType, Addr,
+          ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
+      // HalfValBitSize / 8 bytes past the original address: only as aligned
+      // as both the original alignment and that offset allow.
+      HalfAlignment = MinAlign(Alignment, HalfValBitSize / 8);
+    }
+    Builder.CreateAlignedStore(V, Addr, HalfAlignment);
+  };
+
+  CreateSplitStore(LValue, false);
+  CreateSplitStore(HValue, true);
+
+  // Delete the old store.
+  SI.eraseFromParent();
+  return true;
+}
+
 bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
@@ -5327,6 +5440,8 @@
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    if (TLI && splitMergedValStore(*SI, *DL, *TLI))
+      return true;
     stripInvariantGroupMetadata(*SI);
     if (TLI) {
       unsigned AS = SI->getPointerAddressSpace();
Index: test/CodeGen/X86/split-store.ll
===================================================================
--- test/CodeGen/X86/split-store.ll
+++ test/CodeGen/X86/split-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -force-split-store < %s | FileCheck %s
 
 ; CHECK-LABEL: int32_float_pair
 ; CHECK: movl %edi, (%rsi)
@@ -57,3 +57,200 @@
   store i64 %t4, i64* %ref.tmp, align 8
   ret void
 }
+
+; CHECK-LABEL: int32_int32_pair
+; CHECK: movl	%edi, (%rdx)
+; CHECK: movl	%esi, 4(%rdx)
+define void @int32_int32_pair(i32 %tmp1, i32 %tmp2, i64* %ref.tmp) {
+entry:
+  %t1 = zext i32 %tmp2 to i64
+  %t2 = shl nuw i64 %t1, 32 ; %tmp2 becomes the high 32 bits
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3 ; %tmp1 is the low half, %tmp2 the high half
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int16_int16_pair
+; CHECK: movw	%di, (%rdx)
+; CHECK: movw	%si, 2(%rdx)
+define void @int16_int16_pair(i16 signext %tmp1, i16 signext %tmp2, i32* %ref.tmp) {
+entry:
+  %t1 = zext i16 %tmp2 to i32
+  %t2 = shl nuw i32 %t1, 16 ; %tmp2 becomes the high 16 bits
+  %t3 = zext i16 %tmp1 to i32
+  %t4 = or i32 %t2, %t3 ; i32 built from two i16 halves
+  store i32 %t4, i32* %ref.tmp, align 4
+  ret void
+}
+
+; CHECK-LABEL: int8_int8_pair
+; CHECK: movb	%dil, (%rdx)
+; CHECK: movb	%sil, 1(%rdx)
+define void @int8_int8_pair(i8 signext %tmp1, i8 signext %tmp2, i16* %ref.tmp) {
+entry:
+  %t1 = zext i8 %tmp2 to i16
+  %t2 = shl nuw i16 %t1, 8 ; %tmp2 becomes the high 8 bits
+  %t3 = zext i8 %tmp1 to i16
+  %t4 = or i16 %t2, %t3 ; i16 built from two i8 halves
+  store i16 %t4, i16* %ref.tmp, align 2
+  ret void
+}
+
+; CHECK-LABEL: int31_int31_pair
+; CHECK: andl $2147483647, %edi
+; CHECK: movl %edi, (%rdx)
+; CHECK: andl $2147483647, %esi
+; CHECK: movl %esi, 4(%rdx)
+define void @int31_int31_pair(i31 %tmp1, i31 %tmp2, i64* %ref.tmp) {
+entry:
+  %t1 = zext i31 %tmp2 to i64 ; source is i31, one bit narrower than the half
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i31 %tmp1 to i64 ; source is i31 as well
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int31_int17_pair
+; CHECK: andl $2147483647, %edi
+; CHECK: movl %edi, (%rdx)
+; CHECK: andl $131071, %esi
+; CHECK: movl %esi, 4(%rdx)
+define void @int31_int17_pair(i31 %tmp1, i17 %tmp2, i64* %ref.tmp) {
+entry:
+  %t1 = zext i17 %tmp2 to i64 ; high half comes from an i17
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i31 %tmp1 to i64 ; low half comes from an i31
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int7_int3_pair
+; CHECK: andb $127, %dil
+; CHECK: movb %dil, (%rdx)
+; CHECK: andb $7, %sil
+; CHECK: movb %sil, 1(%rdx)
+define void @int7_int3_pair(i7 signext %tmp1, i3 signext %tmp2, i16* %ref.tmp) {
+entry:
+  %t1 = zext i3 %tmp2 to i16 ; high half comes from an i3
+  %t2 = shl nuw i16 %t1, 8
+  %t3 = zext i7 %tmp1 to i16 ; low half comes from an i7
+  %t4 = or i16 %t2, %t3
+  store i16 %t4, i16* %ref.tmp, align 2
+  ret void
+}
+
+; CHECK-LABEL: int24_int24_pair
+; CHECK: movw	%di, (%rdx)
+; CHECK: shrl	$16, %edi
+; CHECK: movb	%dil, 2(%rdx)
+; CHECK: movl	%esi, %eax
+; CHECK: shrl	$16, %eax
+; CHECK: movb	%al, 6(%rdx)
+; CHECK: movw	%si, 4(%rdx)
+define void @int24_int24_pair(i24 signext %tmp1, i24 signext %tmp2, i48* %ref.tmp) {
+entry:
+  %t1 = zext i24 %tmp2 to i48
+  %t2 = shl nuw i48 %t1, 24 ; each half of the i48 is 24 bits (3 bytes)
+  %t3 = zext i24 %tmp1 to i48
+  %t4 = or i48 %t2, %t3
+  store i48 %t4, i48* %ref.tmp, align 2
+  ret void
+}
+
+; getTypeSizeInBits(i12) != getTypeStoreSizeInBits(i12), so store split doesn't kick in.
+; CHECK-LABEL: int12_int12_pair
+; CHECK: movl	%esi, %eax
+; CHECK: shll	$12, %eax
+; CHECK: andl	$4095, %edi
+; CHECK: orl	%eax, %edi
+; CHECK: shrl	$4, %esi
+; CHECK: movb	%sil, 2(%rdx)
+; CHECK: movw	%di, (%rdx)
+define void @int12_int12_pair(i12 signext %tmp1, i12 signext %tmp2, i24* %ref.tmp) {
+entry:
+  %t1 = zext i12 %tmp2 to i24
+  %t2 = shl nuw i24 %t1, 12 ; a 12-bit half is not a whole number of bytes
+  %t3 = zext i12 %tmp1 to i24
+  %t4 = or i24 %t2, %t3
+  store i24 %t4, i24* %ref.tmp, align 2
+  ret void
+}
+
+; getTypeSizeInBits(i14) != getTypeStoreSizeInBits(i14), so store split doesn't kick in.
+; CHECK-LABEL: int7_int7_pair
+; CHECK: movzbl	%sil, %eax
+; CHECK: shll	$7, %eax
+; CHECK: andb	$127, %dil
+; CHECK: movzbl	%dil, %ecx
+; CHECK: orl	%eax, %ecx
+; CHECK: andl	$16383, %ecx
+; CHECK: movw	%cx, (%rdx)
+define void @int7_int7_pair(i7 signext %tmp1, i7 signext %tmp2, i14* %ref.tmp) {
+entry:
+  %t1 = zext i7 %tmp2 to i14
+  %t2 = shl nuw i14 %t1, 7
+  %t3 = zext i7 %tmp1 to i14
+  %t4 = or i14 %t2, %t3
+  store i14 %t4, i14* %ref.tmp, align 2 ; i14 is not a whole number of bytes
+  ret void
+}
+
+; getTypeSizeInBits(i2) != getTypeStoreSizeInBits(i2), so store split doesn't kick in.
+; CHECK-LABEL: int1_int1_pair
+; CHECK: addb %sil, %sil
+; CHECK: andb $1, %dil
+; CHECK: orb %sil, %dil
+; CHECK: andb $3, %dil
+; CHECK: movb %dil, (%rdx)
+define void @int1_int1_pair(i1 signext %tmp1, i1 signext %tmp2, i2* %ref.tmp) {
+entry:
+  %t1 = zext i1 %tmp2 to i2
+  %t2 = shl nuw i2 %t1, 1
+  %t3 = zext i1 %tmp1 to i2
+  %t4 = or i2 %t2, %t3
+  store i2 %t4, i2* %ref.tmp, align 1 ; i2 is not a whole number of bytes
+  ret void
+}
+
+; CHECK-LABEL: mbb_int32_float_pair
+; CHECK: movl %edi, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
+define void @mbb_int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32 ; bitcast is in entry, the store is in %next
+  br label %next
+next:
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: mbb_int32_float_multi_stores
+; CHECK: movl %edi, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: # %bb2
+; CHECK: movl %edi, (%rdx)
+; CHECK: movss %xmm0, 4(%rdx)
+define void @mbb_int32_float_multi_stores(i32 %tmp1, float %tmp2, i64* %ref.tmp, i64* %ref.tmp1, i1 %cmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32 ; bitcast is in entry, both stores are not
+  br label %bb1
+bb1:
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3 ; %t4 is stored here in bb1 and again in bb2
+  store i64 %t4, i64* %ref.tmp, align 8
+  br i1 %cmp, label %bb2, label %exitbb
+bb2:
+  store i64 %t4, i64* %ref.tmp1, align 8 ; second store of the same %t4
+  br label %exitbb
+exitbb:
+  ret void
+}