Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -449,6 +449,7 @@
     SDValue MatchLoadCombine(SDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue foldRedudantShiftedMasks(SDNode *N);
+    SDNode *shrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *Store);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
     SDValue TransformFPLoadStorePair(SDNode *N);
@@ -12998,6 +12999,83 @@
                 St->getPointerInfo().getWithOffset(StOffset), NewAlign)
       .getNode();
 }
+
+/// Replaces patterns such as:
+///   store 2 (or (load 1 from M), (shl (load 1 from M), 8)) into M
+/// by:
+///   store 1 (load 1 from M) into M+1
+SDNode *
+DAGCombiner::shrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *Store) {
+  SDValue OR = Store->getValue();
+  assert(OR.getOpcode() == ISD::OR &&
+         "Expected to be called with an OR operand.");
+
+  SDValue LoadSD = OR.getOperand(0);
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(LoadSD);
+  SDNode *OtherORop = OR.getOperand(1).getNode();
+  if (!Load) {
+    OtherORop = OR.getOperand(0).getNode();
+    LoadSD = OR.getOperand(1);
+    Load = dyn_cast<LoadSDNode>(LoadSD);
+    if (!Load)
+      return nullptr;
+  }
+  LLVM_DEBUG(dbgs() << "\tGot load: "; Load->dump());
+
+  unsigned BytesShift = 0;
+  if (OtherORop->getOpcode() == ISD::SHL &&
+      OtherORop->getOperand(0).getNode() == Load &&
+      isa<ConstantSDNode>(OtherORop->getOperand(1)))
+    BytesShift = cast<ConstantSDNode>(OtherORop->getOperand(1))
+                     ->getAPIntValue()
+                     .getSExtValue() /
+                 8;
+  LLVM_DEBUG(dbgs() << "\tThat is shifted by: "; OtherORop->dump());
+  LLVM_DEBUG(dbgs() << "\tAnd stored by: "; Store->dump());
+  // TODO: Accept other shift operations such as srl and sra, e.g. by using a
+  // negative value for BytesShift.
+
+  unsigned StoreMemSz = Store->getMemoryVT().getStoreSize();
+  // For now we only accept chains that move one half of the loaded value into
+  // the other half.
+  if (2 * BytesShift != StoreMemSz)
+    return nullptr;
+
+  const SDValue LoadPtr = Load->getBasePtr();
+  SDValue Ptr = Store->getBasePtr();
+  // TODO: Also handle the case where the load and store addresses are both
+  // ADDs of a common base address with a known constant difference.
+  bool SamePtr = LoadPtr == Ptr;
+  unsigned LoadByteOffset = 0;
+  if (!SamePtr) {
+    if (LoadPtr.getOpcode() != ISD::ADD)
+      return nullptr;
+
+    // Detect if we are moving M[A+k] to M[A]:
+    if (LoadPtr.getOperand(0) != Ptr && LoadPtr.getOperand(1) != Ptr)
+      return nullptr;
+
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(LoadPtr.getOperand(1));
+    if (!Offset)
+      Offset = dyn_cast<ConstantSDNode>(LoadPtr.getOperand(0));
+    if (!Offset)
+      return nullptr;
+
+    LoadByteOffset = Offset->getAPIntValue().getZExtValue();
+  }
+
+  unsigned LoadMemSz = Load->getMemoryVT().getStoreSize();
+  bool UpperHalfLoad =
+      LoadByteOffset == LoadMemSz && 2 * LoadMemSz == StoreMemSz;
+  if (!UpperHalfLoad && !SamePtr)
+    return nullptr;
+
+  // TODO: In the SamePtr case, reduce the load width when the load feeds only
+  // this or/shl/store chain (LoadMemSz == StoreMemSz and two uses), and handle
+  // moving the lower half into the upper half.
+  LLVM_DEBUG(dbgs() << "\tReduce store width to half width.\n");
+  return ShrinkLoadReplaceStoreWithStore({LoadMemSz, 0}, LoadSD, Store, this);
+}
 
 /// Look for sequence of load / op / store where op is one of 'or', 'xor', and
 /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
@@ -13013,11 +13091,19 @@
   SDValue Ptr = ST->getBasePtr();
   EVT VT = Value.getValueType();
 
-  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+  if (VT.isVector() || !Value.hasOneUse())
     return SDValue();
   unsigned Opc = Value.getOpcode();
+  if (Opc == ISD::OR && VT.isScalarInteger()) {
+    if (SDNode *NewSt = shrinkLoadShiftOrStoreWithLoadNewStore(ST))
+      return SDValue(NewSt, 0);
+  }
+
+  if (ST->isTruncatingStore())
+    return SDValue();
+
   // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
   // is a byte mask indicating a consecutive number of bytes, check to see if
   // Y is known to provide just those bytes. If so, we try to replace the
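For reference, a minimal C sketch of a source pattern that can lower to the DAG
shape targeted by the new combine (the function name and the use of memcpy are
illustrative only; byte positions assume a little-endian target such as the ARM
configuration in the test below):

    #include <string.h>

    /* Duplicates the byte at p into both bytes of the 16-bit slot at p.
       The stored value is (b | (b << 8)) with b = load 1 from p, so the low
       byte written back equals what memory already holds; the combine can
       shrink the 2-byte store to a single byte store at p + 1, i.e. it
       becomes p[1] = p[0]. */
    void dup_low_byte(unsigned char *p) {
      unsigned b = p[0];
      unsigned short w = (unsigned short)(b | (b << 8));
      memcpy(p, &w, sizeof w);
    }
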
Index: test/CodeGen/ARM/2018_05_30_FoldMaskedMoves.ll
===================================================================
--- test/CodeGen/ARM/2018_05_30_FoldMaskedMoves.ll
+++ test/CodeGen/ARM/2018_05_30_FoldMaskedMoves.ll
@@ -0,0 +1,67 @@
+; RUN: llc -O3 -march=arm %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+
+define void @foo1(i16* %b) {
+entry:
+  %0 = load i16, i16* %b, align 2
+  %conv = sext i16 %0 to i32
+  %and = and i32 %conv, 65280
+  %1 = lshr i32 %conv, 8
+  %and3 = and i32 %1, 255
+  %or = or i32 %and3, %and
+  %conv4 = trunc i32 %or to i16
+  store i16 %conv4, i16* %b, align 2
+  ret void
+}
+; CHECK-LABEL: foo1:
+; CHECK: ldrb r1, [r0, #1]
+; CHECK-NEXT: strb r1, [r0]
+
+define void @foo2(i32* %b) {
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = lshr i32 %0, 16
+  %and2 = and i32 %1, 65535
+  %and = and i32 %0, 4294901760
+  %or = or i32 %and2, %and
+  store i32 %or, i32* %b, align 4
+  ret void
+}
+; CHECK-LABEL: foo2:
+; CHECK: ldrh r1, [r0, #2]
+; CHECK-NEXT: strh r1, [r0]
+
+define void @test_1x4p1(i32* %M, i32 %I) {
+entry:
+  %0 = getelementptr inbounds i32, i32* %M, i32 %I
+  %1 = load i32, i32* %0, align 4
+  %2 = and i32 %1, 65280
+  %3 = lshr i32 %1, 8
+  %4 = and i32 %3, 255
+  %5 = or i32 %2, %4
+  store i32 %5, i32* %0, align 4
+  ret void
+}
+; CHECK-LABEL: test_1x4p1:
+; CHECK: ldrb{{.*}}
+; CHECK-NEXT: orr{{.*}}
+; CHECK-NEXT: str{{.*}}
+
+define void @test_1x4p1_shl(i32* %M, i32 %I) {
+entry:
+  %0 = getelementptr inbounds i32, i32* %M, i32 %I
+  %1 = load i32, i32* %0, align 4
+  %2 = and i32 %1, 65280
+  %3 = shl i32 %1, 8
+  %4 = and i32 %3, 16711680
+  %5 = or i32 %2, %4
+  store i32 %5, i32* %0, align 4
+  ret void
+}
+; CHECK-LABEL: test_1x4p1_shl:
+; CHECK: ldrb{{.*}}
+; CHECK-NEXT: lsl{{.*}}
+; CHECK-NEXT: orr{{.*}}
+; CHECK-NEXT: str{{.*}}
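
To watch the new combine fire on these tests, an asserts-enabled build can
print the LLVM_DEBUG output added above (DAGCombiner.cpp uses the "dagcombine"
debug type); a plausible invocation, assuming such a build:

    llc -O3 -march=arm -debug-only=dagcombine 2018_05_30_FoldMaskedMoves.ll -o -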