Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -278,6 +278,7 @@
     /// target-specific DAG combines.
     SDValue combine(SDNode *N);
 
+    SDValue combine_AND_ShiftAND(SDNode *N, SDValue &N0, SDValue &N1);
     // Visitation implementation - Implement dag node combining for different
     // node types.  The semantics are as follows:
     // Return Value:
@@ -453,6 +454,9 @@
     SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
     SDValue MatchLoadCombine(SDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
+    SDNode *ShrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *storeOp,
+                                                   const SDNode *orOp);
+
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
     SDValue TransformFPLoadStorePair(SDNode *N);
@@ -4005,6 +4009,85 @@
   return false;
 }
 
+// Fold pairs of expressions such as:
+//   x1 = (and x, 0x00FF)
+//   x2 = (and (shl x, 8), 0xFF00)
+// into:
+//   x2 = (shl x1, 8)   ; reuse the computation of x1
+SDValue DAGCombiner::combine_AND_ShiftAND(SDNode *N, SDValue &N0, SDValue &N1) {
+  ConstantSDNode *mask = dyn_cast<ConstantSDNode>(N1);
+  if (!mask)
+    return SDValue();
+
+  if ((N0.getNumOperands() != 2) || (!N0.hasOneUse()))
+    return SDValue();
+
+  ConstantSDNode *shiftAmount = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (!shiftAmount)
+    return SDValue();
+
+  const ISD::NodeType N0Opcode = (ISD::NodeType)N0.getOpcode();
+  if (((N0Opcode < ISD::SHL) || (N0Opcode > ISD::ROTR)) &&
+      ((N0Opcode < ISD::SHL_PARTS) || (N0Opcode > ISD::SRL_PARTS)))
+    return SDValue();
+
+  const auto &maskedValue = dyn_cast<SDNode>(N0.getOperand(0));
+  for (SDNode *otherUser : maskedValue->uses()) {
+    SDNode *shiftOperand = dyn_cast<SDNode>(N0);
+    if ((shiftOperand == nullptr) || (otherUser == shiftOperand) ||
+        (otherUser->getOpcode() != ISD::AND))
+      continue;
+
+    ConstantSDNode *otherMask =
+        dyn_cast<ConstantSDNode>(otherUser->getOperand(1));
+    if (!otherMask)
+      continue;
+
+    bool canReduce = false;
+
+    const APInt &maskValue = mask->getAPIntValue();
+    const APInt &shiftValue = shiftAmount->getAPIntValue();
+    const APInt &otherMaskValue = otherMask->getAPIntValue();
+    switch (N0Opcode) {
+    case ISD::SHL:
+      canReduce = (maskValue.lshr(shiftValue) == otherMaskValue);
+      break;
+    case ISD::SRA:
+    case ISD::SRL:
+      canReduce = (maskValue.shl(shiftValue) == otherMaskValue);
+      break;
+    case ISD::ROTL:
+      canReduce = (maskValue.rotr(shiftValue) == otherMaskValue);
+      break;
+    case ISD::ROTR:
+      canReduce = (maskValue.rotl(shiftValue) == otherMaskValue);
+      break;
+    case ISD::SHL_PARTS:
+    case ISD::SRA_PARTS:
+    case ISD::SRL_PARTS:
+      DEBUG(dbgs() << "Todo\n");
+      break;
+    default:
+      llvm_unreachable("This opcode is not accepted!");
+      break;
+    }
+    if (canReduce) {
+      DEBUG(dbgs() << " with: "; N0.getNode()->dump();
+            dbgs() << " and : "; otherUser->dump(););
+
+      SDValue shiftTheAND(otherUser, 0);
+      const SDLoc DL(N0);
+      EVT VT = N->getValueType(0);
+      SDValue newShift =
+          DAG.getNode(N0Opcode, DL, VT, shiftTheAND, N0.getOperand(1));
+      AddToWorklist(maskedValue);
+      AddToWorklist(otherUser);
+      return newShift;
+    }
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -4205,6 +4288,9 @@
        (N0.getOpcode() == ISD::ANY_EXTEND &&
         N0.getOperand(0).getOpcode() == ISD::LOAD))) {
     if (SDValue Res = ReduceLoadWidth(N)) {
+      if (Res.getOpcode() == ISD::SHL)
+        return Res;
+
       LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND ?
         cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0);
@@ -4214,6 +4300,9 @@
     }
   }
 
+  if (SDValue r = combine_AND_ShiftAND(N, N0, N1))
+    return r;
+
   if (Level >= AfterLegalizeTypes) {
     // Attempt to propagate the AND back up to the leaves which, if they're
     // loads, can be combined to narrow loads and the AND node can be removed.
@@ -6262,13 +6351,36 @@
   }
 
   // fold (srl (shl x, c), c) -> (and x, cst2)
-  if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
-      isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
-    SDLoc DL(N);
-    SDValue Mask =
-        DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
-    AddToWorklist(Mask.getNode());
-    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
+  if ((N0.getOpcode() == ISD::SHL) &&
+      (isConstantOrConstantVector(N1, /* NoOpaques */ true))) {
+    bool canFold = N0.getOperand(1) == N1;
+    if (!canFold) {
+      const ConstantSDNode *CN0N1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      if (CN0N1 && N1C)
+        canFold = CN0N1->getZExtValue() == N1C->getZExtValue();
+    }
+
+    if (canFold) {
+      // fold (srl (shl x, c), c) -> x if the upper c bits of x are known to
+      // be 0.
+      // TODO: Add more instructions that produce known-zero upper bits, other
+      // than zero-extending loads.
+      if (N1C) {
+        if (LoadSDNode *x = dyn_cast<LoadSDNode>(N0.getOperand(0))) {
+          const unsigned xSize = x->getValueSizeInBits(0);
+          const unsigned xMemSize = x->getMemOperand()->getSize() * 8;
+          if ((xSize > xMemSize) &&
+              ((xSize - xMemSize) >= N1C->getZExtValue()) &&
+              (x->getExtensionType() == ISD::LoadExtType::ZEXTLOAD))
+            return N0.getOperand(0);
+        }
+      }
+      SDLoc DL(N);
+      SDValue Mask =
+          DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
+      AddToWorklist(Mask.getNode());
+      return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
+    }
   }
 
   // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
@@ -8504,6 +8616,9 @@
   if (VT.isVector())
     return SDValue();
 
+  unsigned ShAmt = 0;
+  unsigned ShLeftAmt = 0;
+
   // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
   // extended to VT.
   if (Opc == ISD::SIGN_EXTEND_INREG) {
@@ -8531,15 +8646,58 @@
   } else if (Opc == ISD::AND) {
     // An AND with a constant mask is the same as a truncate + zero-extend.
     auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
-    if (!AndC || !AndC->getAPIntValue().isMask())
+    // TODO: Not only [shifted] masks should be accepted; e.g. (and ld.16 [M],
+    // 0x00AB) can be replaced by (and ld.8.zext16 [M], 0x00AB).
+    if (!AndC || !(AndC->getAPIntValue().isMask() ||
+                   AndC->getAPIntValue().isShiftedMask()))
+      return SDValue();
+    const APInt &maskAPInt = AndC->getAPIntValue();
+    unsigned maxBit = maskAPInt.getBitWidth() - maskAPInt.countLeadingZeros();
+    const unsigned minBit = maskAPInt.countTrailingZeros();
+    // Only accept masks whose bounds are multiples of 8 bits and whose width
+    if ((maxBit | minBit) % 8)
       return SDValue();
 
-    unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
+    // is a power of 2.
+    unsigned ActiveBits = maxBit - minBit;
+    if (ActiveBits & (ActiveBits - 1))
+      return SDValue();
+
+    DEBUG(dbgs() << "\n\tMask: "; AndC->dump();
+          dbgs() << "\n\t\tmaxActiveBit: " << maxBit - 1
+                 << "\n\t\tminActiveBit: " << minBit << '\n');
+
+    LoadSDNode *LN0 = dyn_cast<LoadSDNode>(N0);
     ExtType = ISD::ZEXTLOAD;
+    if (minBit != 0) {
+      // TODO: How should this be handled if the operand is not a load?
+      if (LN0 == nullptr)
+        return SDValue();
+
+      const auto &mvt = LN0->getMemoryVT();
+      if (minBit >= mvt.getSizeInBits()) {
+        // The (and) is only filtering bits that came from the extension, not
+        // the actual loaded data.
+        if (ISD::LoadExtType::ZEXTLOAD == LN0->getExtensionType()) {
+          // We only read the zero-extended bits.
+          return DAG.getConstant(0, SDLoc(N), AndC->getValueType(0));
+        }
+        // We would read the sign-extended bits, which are not known here.
+        return SDValue();
+      }
+      if (maxBit > mvt.getSizeInBits())
+        ExtType = LN0->getExtensionType();
+    }
+    // TODO: Accept SEXT if the target can cheaply do a load + shl; e.g.
+    // (and (ld.32bit.sext.from16 [M]), 0x00FFFF00) can be replaced by
+    // (and (shl (ld.32bit.sext.from8 [M+1]), 8), 0x00FFFF00).
+    if (ExtType != ISD::ZEXTLOAD)
+      return SDValue();
+
     ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+    ShAmt = minBit;
+    ShLeftAmt = minBit;
   }
 
-  unsigned ShAmt = 0;
   if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
     SDValue SRL = N0;
     if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {
@@ -8590,7 +8748,6 @@
 
   // If the load is shifted left (and the result isn't shifted back right),
   // we can fold the truncate through the shift.
-  unsigned ShLeftAmt = 0;
   if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
       ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
     if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
@@ -12831,6 +12988,85 @@
                       St->getPointerInfo().getWithOffset(StOffset), NewAlign)
       .getNode();
 }
+/// Detects operations such as M[i] = M[i] | (M[i] << K) or
+/// M[i] = M[i] | (M[i] >> K),
+/// where K is half the store width. If we can prove that the operation only
+/// copies bytes from one position to another, it might be possible to reduce
+/// the width of the store and remove the 'or + shift' operations.
+SDNode *
+DAGCombiner::ShrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *storeOp,
+                                                    const SDNode *orOp) {
+
+  SDValue loadSD = orOp->getOperand(0);
+  LoadSDNode *load = dyn_cast<LoadSDNode>(loadSD);
+  SDNode *other = orOp->getOperand(1).getNode();
+  if (!load) {
+    loadSD = orOp->getOperand(1);
+    load = dyn_cast<LoadSDNode>(loadSD);
+    other = orOp->getOperand(0).getNode();
+  }
+
+  if (!load)
+    return nullptr;
+
+  unsigned shiftedBytes = 0;
+  if ((other->getOpcode() == ISD::SHL) &&
+      (other->getOperand(0).getNode() == load) &&
+      isa<ConstantSDNode>(other->getOperand(1)))
+    shiftedBytes = cast<ConstantSDNode>(other->getOperand(1).getNode())
+                       ->getAPIntValue()
+                       .getSExtValue() /
+                   8;
+  // TODO: Accept other shift operations such as srl and sra; a negative
+  // value of shiftedBytes could represent those.
+
+  unsigned storeMemSz = storeOp->getMemoryVT().getStoreSize();
+  // For now we only accept chains that move half of the loaded value to the
+  // other half.
+  if (2 * shiftedBytes != storeMemSz)
+    return nullptr;
+
+  const SDValue loadPtr = load->getBasePtr();
+  SDValue Ptr = storeOp->getBasePtr();
+  // TODO: Detect when the LOAD and STORE addresses are both ADD nodes with a
+  // known difference.
+  bool samePtr = loadPtr == Ptr;
+  if (!samePtr && (loadPtr.getOpcode() != ISD::ADD))
+    return nullptr;
+
+  // Detect if we are moving M[A+k] to M[A]:
+  if (!((loadPtr.getOperand(0) == Ptr) || (loadPtr.getOperand(1) == Ptr)))
+    return nullptr;
+
+  ConstantSDNode *offset = dyn_cast<ConstantSDNode>(loadPtr.getOperand(1));
+  if (!offset)
+    offset = dyn_cast<ConstantSDNode>(loadPtr.getOperand(0));
+
+  if (!offset)
+    return nullptr;
+
+  int64_t loadByteOffset = offset->getAPIntValue().getSExtValue();
+  // TODO: Accept negative offsets. How often does that happen?
+  if (loadByteOffset < 0)
+    return nullptr;
+
+  unsigned loadMemSz = load->getMemoryVT().getStoreSize();
+  bool upperHalfLoad =
+      ((loadByteOffset == loadMemSz) && (2 * loadMemSz == storeMemSz));
+
+  if (!(upperHalfLoad || samePtr))
+    return nullptr;
+
+  if (samePtr) {
+    // TODO: Store the lower loaded value to the upper half
+    if ((loadMemSz == storeMemSz) && (load->use_size() == 2)) {
+      DEBUG(dbgs() << "Reduce load width\n");
+    }
+    DEBUG(dbgs() << "Move lower to upper half\n");
+  }
+  DEBUG(dbgs() << "Reduce store width to half width\n");
+  return ShrinkLoadReplaceStoreWithStore({loadMemSz, 0}, loadSD, storeOp, this);
+}
 
 /// Look for sequence of load / op / store where op is one of 'or', 'xor', and
 /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
@@ -12843,14 +13079,23 @@
   SDValue Chain = ST->getChain();
   SDValue Value = ST->getValue();
-  SDValue Ptr   = ST->getBasePtr();
+  SDValue Ptr = ST->getBasePtr();
   EVT VT = Value.getValueType();
 
-  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+  if (VT.isVector() || !Value.hasOneUse())
    return SDValue();
 
   unsigned Opc = Value.getOpcode();
+  if (Opc == ISD::OR) {
+    if (SDNode *NewSt =
+            ShrinkLoadShiftOrStoreWithLoadNewStore(ST, Value.getNode()))
+      return SDValue(NewSt, 0);
+  }
+
+  if (ST->isTruncatingStore())
+    return SDValue();
+
   // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
   // is a byte mask indicating a consecutive number of bytes, check to see if
   // Y is known to provide just those bytes. If so, we try to replace the
Index: test/CodeGen/ARM/stld-width-reduction1.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/stld-width-reduction1.ll
@@ -0,0 +1,33 @@
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+; RUN: llc -mtriple=arm %s -o - | FileCheck %s
+; CHECK-LABEL: test_1x2:
+; CHECK: %bb.0:
+; CHECK: ldrh
+; CHECK-NEXT: strb
+
+; Function Attrs: norecurse nounwind
+define dso_local void @test_1x2(i16* nocapture, i32) local_unnamed_addr #0 {
+  %3 = getelementptr inbounds i16, i16* %0, i32 %1
+  %4 = load i16, i16* %3, align 2, !tbaa !3
+  %5 = and i16 %4, 255
+  %6 = shl i16 %4, 8
+  %7 = or i16 %5, %6
+  store i16 %7, i16* %3, align 2, !tbaa !3
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+armv4t,+strict-align,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 7.0.0 (trunk 331513)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"short", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: test/CodeGen/ARM/stld-width-reduction2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/stld-width-reduction2.ll
@@ -0,0 +1,34 @@
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+
+; RUN: llc -mtriple=arm %s -o - | FileCheck %s
+; CHECK-LABEL: test_1x2p1:
+; CHECK: %bb.0:
+; CHECK: ldrb
+; CHECK-NEXT: strb
+
+; Function Attrs: norecurse nounwind
+define dso_local void @test_1x2p1(i16* nocapture, i32) local_unnamed_addr #0 {
+  %3 = getelementptr inbounds i16, i16* %0, i32 %1
+  %4 = load i16, i16* %3, align 2, !tbaa !3
+  %5 = and i16 %4, -256
+  %6 = lshr i16 %4, 8
+  %7 = or i16 %5, %6
+  store i16 %7, i16* %3, align 2, !tbaa !3
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+armv4t,+strict-align,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 7.0.0 (trunk 331513)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"short", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: test/CodeGen/ARM/stld-width-reduction3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/stld-width-reduction3.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+
+; RUN: llc -mtriple=arm %s -o - | FileCheck %s
+; CHECK-LABEL: test_1x4p1:
+; CHECK: ldrb
+; CHECK-NEXT: orr
+; CHECK-NEXT: str
+
+; Function Attrs: norecurse nounwind
+define dso_local void @test_1x4p1(i32* nocapture, i32) local_unnamed_addr #0 {
+  %3 = getelementptr inbounds i32, i32* %0, i32 %1
+  %4 = load i32, i32* %3, align 4, !tbaa !7
+  %5 = and i32 %4, 65280
+  %6 = lshr i32 %4, 8
+  %7 = and i32 %6, 255
+  %8 = or i32 %7, %5
+  store i32 %8, i32* %3, align 4, !tbaa !7
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+armv4t,+strict-align,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 7.0.0 (trunk 331513)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"short", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: test/CodeGen/ARM/stld-width-reduction4.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/stld-width-reduction4.ll
@@ -0,0 +1,36 @@
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+
+; RUN: llc -mtriple=arm %s -o - | FileCheck %s
+; CHECK-LABEL: test_1x4p1_shl:
+; CHECK: ldrb
+; CHECK-NEXT: lsl
+; CHECK-NEXT: orr
+; CHECK-NEXT: str
+
+; Function Attrs: norecurse nounwind
+define dso_local void @test_1x4p1_shl(i32* nocapture, i32) local_unnamed_addr #0 {
+  %3 = getelementptr inbounds i32, i32* %0, i32 %1
+  %4 = load i32, i32* %3, align 4, !tbaa !7
+  %5 = and i32 %4, 65280
+  %6 = shl i32 %4, 8
+  %7 = and i32 %6, 16711680
+  %8 = or i32 %7, %5
+  store i32 %8, i32* %3, align 4, !tbaa !7
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+armv4t,+strict-align,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 7.0.0 (trunk 331513)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"short", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: test/CodeGen/X86/fp128-i128.ll
===================================================================
--- test/CodeGen/X86/fp128-i128.ll
+++ test/CodeGen/X86/fp128-i128.ll
@@ -45,13 +45,13 @@
 ; CHECK-LABEL: TestUnionLD1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF
-; CHECK-NEXT:    andq %rdi, %rcx
-; CHECK-NEXT:    movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
-; CHECK-NEXT:    andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT:    orq %rcx, %rdx
-; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    shlq $48, %rax
+; CHECK-NEXT:    movq -24(%rsp), %rcx
+; CHECK-NEXT:    movabsq $281474976710655, %rdx # imm = 0xFFFFFFFFFFFF
+; CHECK-NEXT:    andq %rdi, %rdx
+; CHECK-NEXT:    orq %rax, %rdx
+; CHECK-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    jmp foo # TAILCALL
Index: test/CodeGen/X86/pr32329.ll
===================================================================
--- test/CodeGen/X86/pr32329.ll
+++ test/CodeGen/X86/pr32329.ll
@@ -29,67 +29,67 @@
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl obj, %edx
-; X86-NEXT:    movsbl var_27, %eax
-; X86-NEXT:    movzwl var_2, %esi
-; X86-NEXT:    movl var_310, %ecx
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    addl var_24, %ecx
-; X86-NEXT:    andl $4194303, %edx # imm = 0x3FFFFF
-; X86-NEXT:    leal (%edx,%edx), %ebx
-; X86-NEXT:    subl %eax, %ebx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    subl %esi, %edi
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
-; X86-NEXT:    movl $9, %esi
-; X86-NEXT:    xorl %ebp, %ebp
-; X86-NEXT:    shldl %cl, %esi, %ebp
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %esi, %ebp
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovnel %ecx, %esi
-; X86-NEXT:    cmpl %edx, %edi
-; X86-NEXT:    movl %ebp, var_50+4
-; X86-NEXT:    movl %esi, var_50
-; X86-NEXT:    setge var_205
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    movb %bl, var_218
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movsbl var_27, %eax
+; X86-NEXT:    movzwl var_2, %esi
+; X86-NEXT:    movl var_310, %ecx
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    addl var_24, %ecx
+; X86-NEXT:    movl $4194303, %edi # imm = 0x3FFFFF
+; X86-NEXT:    andl obj, %edi
+; X86-NEXT:    leal (%edi,%edi), %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
+; X86-NEXT:    movl $9, %esi
+; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:    shldl %cl, %esi, %ebp
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    testb $32, %cl
+; X86-NEXT:    cmovnel %esi, %ebp
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    cmovnel %ecx, %esi
+; X86-NEXT:    cmpl %edi, %ebx
+; X86-NEXT:    movl %ebp, var_50+4
+; X86-NEXT:    movl %esi, var_50
+; X86-NEXT:    setge var_205
+; X86-NEXT:    imull %eax, %edx
+; X86-NEXT:    movb %dl, var_218
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl {{.*}}(%rip), %eax
-; X64-NEXT:    movsbl {{.*}}(%rip), %r9d
-; X64-NEXT:    movzwl {{.*}}(%rip), %r8d
-; X64-NEXT:    movl {{.*}}(%rip), %ecx
-; X64-NEXT:    imull %r9d, %ecx
-; X64-NEXT:    addl {{.*}}(%rip), %ecx
-; X64-NEXT:    andl $4194303, %eax # imm = 0x3FFFFF
-; X64-NEXT:    leal (%rax,%rax), %edi
-; X64-NEXT:    subl %r9d, %edi
-; X64-NEXT:    movl %edi, %esi
-; X64-NEXT:    subl %r8d, %esi
-; X64-NEXT:    imull %esi, %ecx
-; X64-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
-; X64-NEXT:    movl $9, %edx
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq %rdx, {{.*}}(%rip)
-; X64-NEXT:    cmpl %eax, %esi
-; X64-NEXT:    setge {{.*}}(%rip)
-; X64-NEXT:    imull %r9d, %edi
-; X64-NEXT:    movb %dil, {{.*}}(%rip)
-; X64-NEXT:    retq
+; X64-NEXT:    movsbl var_27(%rip), %r9d
+; X64-NEXT:    movzwl var_2(%rip), %r8d
+; X64-NEXT:    movl var_310(%rip), %ecx
+; X64-NEXT:    imull %r9d, %ecx
+; X64-NEXT:    addl var_24(%rip), %ecx
+; X64-NEXT:    movl $4194303, %esi # imm = 0x3FFFFF
+; X64-NEXT:    andl obj(%rip), %esi
+; X64-NEXT:    leal (%rsi,%rsi), %edi
+; X64-NEXT:    subl %r9d, %edi
+; X64-NEXT:    movl %edi, %edx
+; X64-NEXT:    subl %r8d, %edx
+; X64-NEXT:    imull %edx, %ecx
+; X64-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
+; X64-NEXT:    movl $9, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    movq %rax, var_50(%rip)
+; X64-NEXT:    cmpl %esi, %edx
+; X64-NEXT:    setge var_205(%rip)
+; X64-NEXT:    imull %r9d, %edi
+; X64-NEXT:    movb %dil, var_218(%rip)
+; X64-NEXT:    retq
 entry:
   %bf.load = load i32, i32* bitcast (%struct.AA* @obj to i32*), align 8
   %bf.clear = shl i32 %bf.load, 1
Index: test/CodeGen/X86/pr32588.ll
===================================================================
--- test/CodeGen/X86/pr32588.ll
+++ test/CodeGen/X86/pr32588.ll
@@ -4,10 +4,10 @@
 @b = external local_unnamed_addr global i32, align 4
 @d = external local_unnamed_addr global i32, align 4
 
-; CHECK: cmpl $1, c(%rip)
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: movl %eax, d(%rip)
+; CHECK: xorl %eax, %eax
+; CHECK-NEXT: cmpl $0, c(%rip)
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: movl %eax, d(%rip)
 ; CHECK-NEXT: retq
 
 define void @fn1() {