Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -448,6 +448,7 @@
     SDValue MatchLoadCombine(SDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue foldRedundantShiftedMasks(SDNode *N);
+    SDNode *shrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *Store);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
     SDValue TransformFPLoadStorePair(SDNode *N);
@@ -13042,6 +13043,108 @@
                 St->getPointerInfo().getWithOffset(StOffset), NewAlign)
       .getNode();
 }
+/// Replaces patterns that copy half of a memory element to the other half. Ex:
+//   store i16 (or (load i8 from M), (shl (load i8 from M), 8)) into M
+//   can be replaced by store i8 (load i8 from M) into M+1
+SDNode *
+DAGCombiner::shrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *Store) {
+  SDValue OR = Store->getValue();
+  assert(OR.getOpcode() == ISD::OR && "Expecting ISD::OR");
+  if (OR.getOpcode() != ISD::OR)
+    return nullptr;
+
+  SDValue LoadSD = OR.getOperand(0);
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(LoadSD);
+  SDNode *OtherORop = OR->getOperand(1).getNode();
+  if (!Load) {
+    OtherORop = OR->getOperand(0).getNode();
+    LoadSD = OR->getOperand(1);
+    Load = dyn_cast<LoadSDNode>(LoadSD);
+    if (!Load)
+      return nullptr;
+  }
+
+  unsigned BytesShift = 0;
+
+  if ((OtherORop->getOpcode() == ISD::SHL) &&
+      (OtherORop->getOperand(0).getNode() == Load) &&
+      isa<ConstantSDNode>(OtherORop->getOperand(1)))
+    BytesShift = cast<ConstantSDNode>(OtherORop->getOperand(1).getNode())
+                     ->getAPIntValue()
+                     .getSExtValue() /
+                 8;
+  // TODO: Accept other shifting operations such as srl, sra. Could
+  // use negative values for BytesShift.
+  LLVM_DEBUG(dbgs() << "\tGot load: "; Load->dump());
+  LLVM_DEBUG(dbgs() << "\tThat is shifted by: "; OtherORop->dump());
+  LLVM_DEBUG(dbgs() << "\tOR combined by: "; OR->dump());
+  LLVM_DEBUG(dbgs() << "\tAnd stored by: "; Store->dump());
+
+  unsigned StoreMemSz = Store->getMemoryVT().getStoreSize();
+  // For now we only accept chains that move half of the loaded value to the
+  // other half.
+  if (2 * BytesShift != StoreMemSz)
+    return nullptr;
+
+  const SDValue LoadPtr = Load->getBasePtr();
+  SDValue Ptr = Store->getBasePtr();
+  unsigned LoadMemSz = Load->getMemoryVT().getStoreSize();
+  // TODO: Detect when both the LOAD and STORE memory addresses are ADD
+  // instructions on a common base address, with a known constant difference.
+  // Ex: load i8 [M+3] and store i16 [M+2]
+  if (LoadPtr == Ptr) {
+    if ((LoadMemSz < StoreMemSz) && Load->getExtensionType() != ISD::ZEXTLOAD)
+      return nullptr;
+
+    if (LoadMemSz == BytesShift) {
+      // Replace something like
+      //   store i16 (or (ld i8 [M], zext to i16), (shl (ld i8 [M]), 8)), [M]
+      // by
+      //   store i8 (ld i8 [M]), [M+1]
+      LLVM_DEBUG(dbgs() << "\tNot writing the lower half of the store\n");
+      return ShrinkLoadReplaceStoreWithStore({LoadMemSz, 0}, LoadSD, Store,
+                                             this);
+    }
+    return nullptr;
+  }
+
+  if (LoadPtr.getOpcode() != ISD::ADD)
+    return nullptr;
+
+  // Detect if we are moving M[A+k] to M[A]:
+  if (!((LoadPtr.getOperand(0) == Ptr) || (LoadPtr.getOperand(1) == Ptr)))
+    return nullptr;
+
+  ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(LoadPtr.getOperand(1));
+  if (!Offset)
+    Offset = dyn_cast<ConstantSDNode>(LoadPtr.getOperand(0));
+
+  if (!Offset)
+    return nullptr;
+
+  unsigned LoadByteOffset = Offset->getAPIntValue().getZExtValue();
+  if ((LoadByteOffset == LoadMemSz) && (2 * LoadMemSz == StoreMemSz) &&
+      Load->getExtensionType() == ISD::ZEXTLOAD) {
+    // Replace something like
+    //   store i16 (or (ld i8 [M+1]), (shl (ld i8 [M+1]), 8)), [M]
+    // by
+    //   store i8 (ld i8 [M+1]), [M]. The ld must be zext.
+    MVT VT = MVT::getIntegerVT(LoadMemSz * 8);
+    if (!isTypeLegal(VT))
+      return nullptr;
+
+    // Truncate down to the new size.
+    SDValue IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(OR), VT, LoadSD);
+
+    ++OpsNarrowed;
+    LLVM_DEBUG(dbgs() << "\tNot writing the upper half of the store\n");
+    return DAG
+        .getStore(Store->getChain(), SDLoc(Store), IVal, Ptr,
+                  Store->getPointerInfo(), LoadMemSz)
+        .getNode();
+  }
+  return nullptr;
+}
 
 /// Look for sequence of load / op / store where op is one of 'or', 'xor', and
 /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
@@ -13057,11 +13160,19 @@
   SDValue Ptr = ST->getBasePtr();
   EVT VT = Value.getValueType();
 
-  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+  if (VT.isVector() || !Value.hasOneUse())
     return SDValue();
 
   unsigned Opc = Value.getOpcode();
 
+  if ((Opc == ISD::OR) && VT.isScalarInteger()) {
+    if (SDNode *NewSt = shrinkLoadShiftOrStoreWithLoadNewStore(ST))
+      return SDValue(NewSt, 0);
+  }
+
+  if (ST->isTruncatingStore())
+    return SDValue();
+
   // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
   // is a byte mask indicating a consecutive number of bytes, check to see if
   // Y is known to provide just those bytes.  If so, we try to replace the
Index: lib/Transforms/Utils/SimplifyIndVar.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyIndVar.cpp
+++ lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -784,7 +784,7 @@
     for (unsigned N = 0; IVOperand; ++N) {
       assert(N <= Simplified.size() && "runaway iteration");
 
-      Value *NewOper = foldIVUser(UseInst, IVOperand);
+      Value *NewOper = foldIVUser(UseOper.first, IVOperand);
       if (!NewOper)
         break; // done folding
       IVOperand = dyn_cast<Instruction>(NewOper);
@@ -792,12 +792,12 @@
     if (!IVOperand)
       continue;
 
-    if (eliminateIVUser(UseInst, IVOperand)) {
+    if (eliminateIVUser(UseOper.first, IVOperand)) {
       pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
       continue;
     }
 
-    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseInst)) {
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseOper.first)) {
       if ((isa<OverflowingBinaryOperator>(BO) &&
            strengthenOverflowingOperation(BO, IVOperand)) ||
           (isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) {
@@ -807,13 +807,13 @@
       }
     }
 
-    CastInst *Cast = dyn_cast<CastInst>(UseInst);
+    CastInst *Cast = dyn_cast<CastInst>(UseOper.first);
     if (V && Cast) {
       V->visitCast(Cast);
       continue;
     }
-    if (isSimpleIVUser(UseInst, L, SE)) {
-      pushIVUsers(UseInst, L, Simplified, SimpleIVUsers);
+    if (isSimpleIVUser(UseOper.first, L, SE)) {
+      pushIVUsers(UseOper.first, L, Simplified, SimpleIVUsers);
     }
   }
 }
Index: test/CodeGen/ARM/2018_05_30_FoldMakedMoves.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/2018_05_30_FoldMakedMoves.ll
@@ -0,0 +1,67 @@
+; RUN: llc -O3 -march=arm %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+define void @foo1(i16* %b){
+entry:
+  %0 = load i16, i16* %b, align 2
+  %conv = sext i16 %0 to i32
+  %and = and i32 %conv, 65280
+  %1 = lshr i32 %conv, 8
+  %and3 = and i32 %1, 255
+  %or = or i32 %and3, %and
+  %conv4 = trunc i32 %or to i16
+  store i16 %conv4, i16* %b, align 2
+  ret void
+}
+; CHECK-LABEL: foo1
+; CHECK: ldrb r1, [r0, #1]
+; CHECK-NEXT: strb r1, [r0]
+
+define void @foo2(i32* %b){
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = lshr i32 %0, 16
+  %and2 = and i32 %1, 65535
+  %and = and i32 %0, 4294901760
+  %or = or i32 %and2, %and
+  store i32 %or, i32* %b, align 4
+  ret void
+}
+; CHECK-LABEL: foo2
+; CHECK: ldrh r1, [r0, #2]
+; CHECK-NEXT: strh r1, [r0]
+
+define void @test_1x4p1(i32* %M, i32 %I) {
+entry:
+  %0 = getelementptr inbounds i32, i32* %M, i32 %I
+  %1 = load i32, i32* %0, align 4
+  %2 = and i32 %1, 65280
+  %3 = lshr i32 %1, 8
+  %4 = and i32 %3, 255
+  %5 = or i32 %2, %4
+  store i32 %5, i32* %0, align 4
+  ret void
+}
+; CHECK-LABEL: test_1x4p1:
+; CHECK: ldrb{{.*}}
+; CHECK-NEXT: orr{{.*}}
+; CHECK-NEXT: str{{.*}}
+
+
+define void @test_1x4p1_shl(i32* %M, i32 %I) {
+entry:
+  %0 = getelementptr inbounds i32, i32* %M, i32 %I
+  %1 = load i32, i32* %0, align 4
+  %2 = and i32 %1, 65280
+  %3 = shl i32 %1, 8
+  %4 = and i32 %3, 16711680
+  %5 = or i32 %2, %4
+  store i32 %5, i32* %0, align 4
+  ret void
+}
+; CHECK-LABEL: test_1x4p1_shl:
+; CHECK: ldrb{{.*}}
+; CHECK-NEXT: lsl{{.*}}
+; CHECK-NEXT: orr{{.*}}
+; CHECK-NEXT: str{{.*}}
+
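For reference, a minimal LLVM IR sketch of the two shapes described in the comments of shrinkLoadShiftOrStoreWithLoadNewStore. This is illustrative only and not part of the patch or its tests; the function names @half_copy_sketch and @upper_half_sketch are hypothetical, and the intended results (a single strb at M+1, respectively at M, on little-endian ARM) are the behavior the comments describe, not checked output.

; Lower-half case: "store i16 (or (load i8 from M), (shl (load i8 from M), 8)) into M".
; The low byte written back equals the byte already at M, so only the high
; byte (M+1) needs to be stored.
define void @half_copy_sketch(i16* %M) {
entry:
  %p = bitcast i16* %M to i8*
  %b = load i8, i8* %p, align 2
  %z = zext i8 %b to i16
  %hi = shl i16 %z, 8
  %or = or i16 %z, %hi
  store i16 %or, i16* %M, align 2
  ret void
}

; Upper-half case: the loaded byte comes from M+1, so the high byte of the
; i16 store is unchanged and only the low byte at M needs to be stored.
define void @upper_half_sketch(i16* %M) {
entry:
  %p0 = bitcast i16* %M to i8*
  %p1 = getelementptr inbounds i8, i8* %p0, i32 1
  %b = load i8, i8* %p1, align 1
  %z = zext i8 %b to i16
  %hi = shl i16 %z, 8
  %or = or i16 %z, %hi
  store i16 %or, i16* %M, align 2
  ret void
}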