Index: include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- include/llvm/CodeGen/SelectionDAG.h
+++ include/llvm/CodeGen/SelectionDAG.h
@@ -1380,15 +1380,20 @@
   /// every vector element.
   /// Targets can implement the computeKnownBitsForTargetNode method in the
   /// TargetLowering class to allow target nodes to be understood.
-  void computeKnownBits(SDValue Op, KnownBits &Known, unsigned Depth = 0) const;
+  void
+  computeKnownBits(SDValue Op, KnownBits &Known,
+                   SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads = nullptr,
+                   unsigned Depth = 0) const;
 
   /// Determine which bits of Op are known to be either zero or one and return
   /// them in Known. The DemandedElts argument allows us to only collect the
   /// known bits that are shared by the requested vector elements.
   /// Targets can implement the computeKnownBitsForTargetNode method in the
   /// TargetLowering class to allow target nodes to be understood.
-  void computeKnownBits(SDValue Op, KnownBits &Known, const APInt &DemandedElts,
-                        unsigned Depth = 0) const;
+  void
+  computeKnownBits(SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+                   SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads = nullptr,
+                   unsigned Depth = 0) const;
 
   /// Used to represent the possible overflow behavior of an operation.
   /// Never: the operation cannot overflow.
Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -2741,11 +2741,10 @@
   /// or one and return them in the KnownZero/KnownOne bitsets. The DemandedElts
   /// argument allows us to only collect the known bits that are shared by the
   /// requested vector elements.
-  virtual void computeKnownBitsForTargetNode(const SDValue Op,
-                                             KnownBits &Known,
-                                             const APInt &DemandedElts,
-                                             const SelectionDAG &DAG,
-                                             unsigned Depth = 0) const;
+  virtual void computeKnownBitsForTargetNode(
+      const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+      const SelectionDAG &DAG,
+      SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+      unsigned Depth = 0) const;
 
   /// Determine which of the bits of FrameIndex \p FIOp are known to be 0.
   /// Default implementation computes low bits based on alignment
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2114,7 +2114,7 @@
 bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask,
                                      unsigned Depth) const {
   KnownBits Known;
-  computeKnownBits(Op, Known, Depth);
+  computeKnownBits(Op, Known, nullptr, Depth);
   return Mask.isSubsetOf(Known.Zero);
 }
 
@@ -2157,21 +2157,25 @@
 /// Determine which bits of Op are known to be either zero or one and return
 /// them in Known. For vectors, the known bits are those that are shared by
 /// every vector element.
-void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
-                                    unsigned Depth) const {
+void SelectionDAG::computeKnownBits(
+    SDValue Op, KnownBits &Known,
+    SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads, unsigned Depth) const {
   EVT VT = Op.getValueType();
   APInt DemandedElts = VT.isVector()
                            ? APInt::getAllOnesValue(VT.getVectorNumElements())
                            : APInt(1, 1);
-  computeKnownBits(Op, Known, DemandedElts, Depth);
+  computeKnownBits(Op, Known, DemandedElts, AnyToZeroExtLoads, Depth);
 }
 
 /// Determine which bits of Op are known to be either zero or one and return
 /// them in Known. The DemandedElts argument allows us to only collect the known
-/// bits that are shared by the requested vector elements.
-void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
-                                    const APInt &DemandedElts,
-                                    unsigned Depth) const {
+/// bits that are shared by the requested vector elements. AnyToZeroExtLoads
+/// optionally allows anyext loads to be treated as zeroext (top bits zero);
+/// any such load is returned in the SmallPtrSet and must be converted to a
+/// zeroext load for the returned KnownBits to be valid.
+void SelectionDAG::computeKnownBits(
+    SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+    SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads, unsigned Depth) const {
   unsigned BitWidth = Op.getScalarValueSizeInBits();
 
   Known = KnownBits(BitWidth); // Don't know anything.
@@ -2210,7 +2214,7 @@
         continue;
 
       SDValue SrcOp = Op.getOperand(i);
-      computeKnownBits(SrcOp, Known2, Depth + 1);
+      computeKnownBits(SrcOp, Known2, AnyToZeroExtLoads, Depth + 1);
 
       // BUILD_VECTOR can implicitly truncate sources, we must handle this.
       if (SrcOp.getValueSizeInBits() != BitWidth) {
@@ -2257,7 +2261,7 @@
     // Known bits are the values that are shared by every demanded element.
     if (!!DemandedLHS) {
       SDValue LHS = Op.getOperand(0);
-      computeKnownBits(LHS, Known2, DemandedLHS, Depth + 1);
+      computeKnownBits(LHS, Known2, DemandedLHS, AnyToZeroExtLoads, Depth + 1);
       Known.One &= Known2.One;
       Known.Zero &= Known2.Zero;
     }
@@ -2266,7 +2270,7 @@
       break;
     if (!!DemandedRHS) {
       SDValue RHS = Op.getOperand(1);
-      computeKnownBits(RHS, Known2, DemandedRHS, Depth + 1);
+      computeKnownBits(RHS, Known2, DemandedRHS, AnyToZeroExtLoads, Depth + 1);
       Known.One &= Known2.One;
       Known.Zero &= Known2.Zero;
     }
@@ -2283,7 +2287,8 @@
       DemandedSub = DemandedSub.trunc(NumSubVectorElts);
       if (!!DemandedSub) {
         SDValue Sub = Op.getOperand(i);
-        computeKnownBits(Sub, Known2, DemandedSub, Depth + 1);
+        computeKnownBits(Sub, Known2, DemandedSub, AnyToZeroExtLoads,
+                         Depth + 1);
         Known.One &= Known2.One;
         Known.Zero &= Known2.Zero;
       }
@@ -2306,22 +2311,24 @@
       uint64_t Idx = SubIdx->getZExtValue();
       APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
       if (!!DemandedSubElts) {
-        computeKnownBits(Sub, Known, DemandedSubElts, Depth + 1);
+        computeKnownBits(Sub, Known, DemandedSubElts, AnyToZeroExtLoads,
+                         Depth + 1);
         if (Known.isUnknown())
           break; // early-out.
       }
       APInt SubMask = APInt::getBitsSet(NumElts, Idx, Idx + NumSubElts);
       APInt DemandedSrcElts = DemandedElts & ~SubMask;
       if (!!DemandedSrcElts) {
-        computeKnownBits(Src, Known2, DemandedSrcElts, Depth + 1);
+        computeKnownBits(Src, Known2, DemandedSrcElts, AnyToZeroExtLoads,
+                         Depth + 1);
         Known.One &= Known2.One;
         Known.Zero &= Known2.Zero;
       }
     } else {
-      computeKnownBits(Sub, Known, Depth + 1);
+      computeKnownBits(Sub, Known, AnyToZeroExtLoads, Depth + 1);
       if (Known.isUnknown())
         break; // early-out.
-      computeKnownBits(Src, Known2, Depth + 1);
+      computeKnownBits(Src, Known2, AnyToZeroExtLoads, Depth + 1);
       Known.One &= Known2.One;
       Known.Zero &= Known2.Zero;
     }
@@ -2337,9 +2344,9 @@
       // Offset the demanded elts by the subvector index.
       uint64_t Idx = SubIdx->getZExtValue();
       APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
-      computeKnownBits(Src, Known, DemandedSrc, Depth + 1);
+      computeKnownBits(Src, Known, DemandedSrc, AnyToZeroExtLoads, Depth + 1);
     } else {
-      computeKnownBits(Src, Known, Depth + 1);
+      computeKnownBits(Src, Known, AnyToZeroExtLoads, Depth + 1);
     }
     break;
   }
@@ -2354,7 +2361,7 @@
 
     // Fast handling of 'identity' bitcasts.
     if (BitWidth == SubBitWidth) {
-      computeKnownBits(N0, Known, DemandedElts, Depth + 1);
+      computeKnownBits(N0, Known, DemandedElts, AnyToZeroExtLoads, Depth + 1);
       break;
     }
@@ -2378,7 +2385,7 @@
           SubDemandedElts.setBit(i * SubScale);
 
       for (unsigned i = 0; i != SubScale; ++i) {
-        computeKnownBits(N0, Known2, SubDemandedElts.shl(i),
+        computeKnownBits(N0, Known2, SubDemandedElts.shl(i), AnyToZeroExtLoads,
                          Depth + 1);
         Known.One |= Known2.One.zext(BitWidth).shl(SubBitWidth * i);
         Known.Zero |= Known2.Zero.zext(BitWidth).shl(SubBitWidth * i);
@@ -2398,7 +2405,8 @@
         if (DemandedElts[i])
           SubDemandedElts.setBit(i / SubScale);
 
-      computeKnownBits(N0, Known2, SubDemandedElts, Depth + 1);
+      computeKnownBits(N0, Known2, SubDemandedElts, AnyToZeroExtLoads,
+                       Depth + 1);
 
       Known.Zero.setAllBits(); Known.One.setAllBits();
       for (unsigned i = 0; i != NumElts; ++i)
@@ -2415,8 +2423,10 @@
   }
   case ISD::AND:
     // If either the LHS or the RHS are Zero, the result is zero.
-    computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(1), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
 
     // Output known-1 bits are only known if set in both the LHS & RHS.
     Known.One &= Known2.One;
@@ -2424,8 +2434,10 @@
     Known.Zero |= Known2.Zero;
     break;
   case ISD::OR:
-    computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(1), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
 
     // Output known-0 bits are only known if clear in both the LHS & RHS.
     Known.Zero &= Known2.Zero;
@@ -2433,8 +2445,10 @@
     Known.One |= Known2.One;
     break;
   case ISD::XOR: {
-    computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(1), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
 
     // Output known-0 bits are known if clear or set in both the LHS & RHS.
     APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
@@ -2444,8 +2458,10 @@
     break;
   }
   case ISD::MUL: {
-    computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(1), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
 
     // If low bits are zero in either operand, output low known-0 bits.
     // Also compute a conservative estimate for high known-0 bits.
@@ -2466,10 +2482,12 @@
     // For the purposes of computing leading zeros we can conservatively
     // treat a udiv as a logical right shift by the power of 2 known to
    // be less than the denominator.
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     unsigned LeadZ = Known2.countMinLeadingZeros();
 
-    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
     if (RHSMaxLeadingZeros != BitWidth)
       LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
@@ -2479,22 +2497,26 @@
   }
   case ISD::SELECT:
  case ISD::VSELECT:
-    computeKnownBits(Op.getOperand(2), Known, DemandedElts, Depth+1);
+    computeKnownBits(Op.getOperand(2), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     // If we don't know any bits, early out.
     if (Known.isUnknown())
       break;
-    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth+1);
+    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
 
     // Only known if known in both the LHS and RHS.
     Known.One &= Known2.One;
     Known.Zero &= Known2.Zero;
     break;
   case ISD::SELECT_CC:
-    computeKnownBits(Op.getOperand(3), Known, DemandedElts, Depth+1);
+    computeKnownBits(Op.getOperand(3), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     // If we don't know any bits, early out.
     if (Known.isUnknown())
       break;
-    computeKnownBits(Op.getOperand(2), Known2, DemandedElts, Depth+1);
+    computeKnownBits(Op.getOperand(2), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
 
     // Only known if known in both the LHS and RHS.
     Known.One &= Known2.One;
@@ -2523,7 +2545,8 @@
     break;
   case ISD::SHL:
     if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
-      computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+      computeKnownBits(Op.getOperand(0), Known, DemandedElts,
+                       AnyToZeroExtLoads, Depth + 1);
       unsigned Shift = ShAmt->getZExtValue();
       Known.Zero <<= Shift;
       Known.One <<= Shift;
@@ -2533,7 +2556,8 @@
     break;
   case ISD::SRL:
     if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
-      computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+      computeKnownBits(Op.getOperand(0), Known, DemandedElts,
+                       AnyToZeroExtLoads, Depth + 1);
       unsigned Shift = ShAmt->getZExtValue();
       Known.Zero.lshrInPlace(Shift);
       Known.One.lshrInPlace(Shift);
@@ -2562,7 +2586,8 @@
     break;
   case ISD::SRA:
     if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
-      computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+      computeKnownBits(Op.getOperand(0), Known, DemandedElts,
+                       AnyToZeroExtLoads, Depth + 1);
       unsigned Shift = ShAmt->getZExtValue();
       // Sign extend known zero/one bit (else is unknown).
       Known.Zero.ashrInPlace(Shift);
@@ -2586,7 +2611,8 @@
     if (NewBits.getBoolValue())
       InputDemandedBits |= InSignMask;
 
-    computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     Known.One &= InputDemandedBits;
     Known.Zero &= InputDemandedBits;
@@ -2606,7 +2632,8 @@
   }
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF: {
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     // If we have a known 1, its position is our upper bound.
     unsigned PossibleTZ = Known2.countMaxTrailingZeros();
     unsigned LowBits = Log2_32(PossibleTZ) + 1;
@@ -2615,7 +2642,8 @@
   }
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF: {
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     // If we have a known 1, its position is our upper bound.
     unsigned PossibleLZ = Known2.countMaxLeadingZeros();
     unsigned LowBits = Log2_32(PossibleLZ) + 1;
@@ -2623,7 +2651,8 @@
     break;
   }
   case ISD::CTPOP: {
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     // If we know some of the bits are zero, they can't be one.
     unsigned PossibleOnes = Known2.countMaxPopulation();
     Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1);
@@ -2636,6 +2665,14 @@
       EVT VT = LD->getMemoryVT();
       unsigned MemBits = VT.getScalarSizeInBits();
       Known.Zero.setBitsFrom(MemBits);
+      // If this is an EXTLoad and we are passed AnyToZeroExtLoads, treat
+      // the load as zero extended.
+    } else if (AnyToZeroExtLoads && ISD::isEXTLoad(Op.getNode()) &&
+               Op.getResNo() == 0) {
+      EVT VT = LD->getMemoryVT();
+      unsigned MemBits = VT.getScalarSizeInBits();
+      Known.Zero.setBitsFrom(MemBits);
+      AnyToZeroExtLoads->insert(LD);
     } else if (const MDNode *Ranges = LD->getRanges()) {
       if (LD->getExtensionType() == ISD::NON_EXTLOAD)
         computeKnownBitsFromRangeMetadata(*Ranges, Known);
@@ -2645,40 +2682,44 @@
   case ISD::ZERO_EXTEND_VECTOR_INREG: {
     EVT InVT = Op.getOperand(0).getValueType();
     APInt InDemandedElts = DemandedElts.zext(InVT.getVectorNumElements());
-    computeKnownBits(Op.getOperand(0), Known, InDemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known, InDemandedElts,
+                     AnyToZeroExtLoads, Depth + 1);
     Known = Known.zext(BitWidth);
     Known.Zero.setBitsFrom(InVT.getScalarSizeInBits());
     break;
   }
   case ISD::ZERO_EXTEND: {
     EVT InVT = Op.getOperand(0).getValueType();
-    computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     Known = Known.zext(BitWidth);
     Known.Zero.setBitsFrom(InVT.getScalarSizeInBits());
     break;
   }
   // TODO ISD::SIGN_EXTEND_VECTOR_INREG
   case ISD::SIGN_EXTEND: {
-    computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     // If the sign bit is known to be zero or one, then sext will extend
     // it to the top bits, else it will just zext.
     Known = Known.sext(BitWidth);
     break;
   }
   case ISD::ANY_EXTEND: {
-    computeKnownBits(Op.getOperand(0), Known, Depth+1);
+    computeKnownBits(Op.getOperand(0), Known, AnyToZeroExtLoads, Depth + 1);
     Known = Known.zext(BitWidth);
     break;
   }
   case ISD::TRUNCATE: {
-    computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     Known = Known.trunc(BitWidth);
     break;
   }
   case ISD::AssertZext: {
     EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
     APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
-    computeKnownBits(Op.getOperand(0), Known, Depth+1);
+    computeKnownBits(Op.getOperand(0), Known, AnyToZeroExtLoads, Depth + 1);
     Known.Zero |= (~InMask);
     Known.One &= (~Known.Zero);
     break;
   }
@@ -2709,7 +2750,7 @@
     // NLZ can't be BitWidth with no sign bit
     APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
     computeKnownBits(Op.getOperand(1), Known2, DemandedElts,
-                     Depth + 1);
+                     AnyToZeroExtLoads, Depth + 1);
 
     // If all of the MaskV bits are known to be zero, then we know the
     // output top bits are zero, because we now know that the output is
@@ -2725,12 +2766,14 @@
     // If low bits are known to be zero in both operands, then we know they are
     // going to be 0 in the result. Both addition and complement operations
     // preserve the low zero bits.
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     unsigned KnownZeroLow = Known2.countMinTrailingZeros();
     if (KnownZeroLow == 0)
       break;
 
-    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
     Known.Zero.setLowBits(KnownZeroLow);
     break;
@@ -2757,11 +2800,12 @@
     // known to be clear. For example, if one input has the top 10 bits clear
     // and the other has the top 8 bits clear, we know the top 7 bits of the
     // output must be clear.
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     unsigned KnownZeroHigh = Known2.countMinLeadingZeros();
     unsigned KnownZeroLow = Known2.countMinTrailingZeros();
 
-    computeKnownBits(Op.getOperand(1), Known2, DemandedElts,
+    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, AnyToZeroExtLoads,
                      Depth + 1);
     KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros());
     KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
@@ -2786,7 +2830,8 @@
       const APInt &RA = Rem->getAPIntValue().abs();
       if (RA.isPowerOf2()) {
         APInt LowBits = RA - 1;
-        computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+        computeKnownBits(Op.getOperand(0), Known2, DemandedElts,
+                         AnyToZeroExtLoads, Depth + 1);
 
         // The low bits of the first operand are unchanged by the srem.
         Known.Zero = Known2.Zero & LowBits;
@@ -2810,7 +2855,8 @@
       const APInt &RA = Rem->getAPIntValue();
       if (RA.isPowerOf2()) {
         APInt LowBits = (RA - 1);
-        computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+        computeKnownBits(Op.getOperand(0), Known2, DemandedElts,
+                         AnyToZeroExtLoads, Depth + 1);
 
         // The upper bits are all zero, the lower ones are unchanged.
         Known.Zero = Known2.Zero | ~LowBits;
@@ -2821,8 +2867,10 @@
 
     // Since the result is less than or equal to either operand, any leading
     // zero bits in either operand must also exist in the result.
-    computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
-    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
+    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
 
     uint32_t Leaders =
         std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
@@ -2831,7 +2879,7 @@
     break;
   }
   case ISD::EXTRACT_ELEMENT: {
-    computeKnownBits(Op.getOperand(0), Known, Depth+1);
+    computeKnownBits(Op.getOperand(0), Known, AnyToZeroExtLoads, Depth + 1);
     const unsigned Index = Op.getConstantOperandVal(1);
     const unsigned BitWidth = Op.getValueSizeInBits();
@@ -2859,10 +2907,10 @@
       // If we know the element index, just demand that vector element.
       unsigned Idx = ConstEltNo->getZExtValue();
       APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
-      computeKnownBits(InVec, Known, DemandedElt, Depth + 1);
+      computeKnownBits(InVec, Known, DemandedElt, AnyToZeroExtLoads, Depth + 1);
     } else {
       // Unknown element index, so ignore DemandedElts and demand them all.
-      computeKnownBits(InVec, Known, Depth + 1);
+      computeKnownBits(InVec, Known, AnyToZeroExtLoads, Depth + 1);
     }
     if (BitWidth > EltBitWidth)
       Known = Known.zext(BitWidth);
@@ -2882,7 +2930,7 @@
 
       // If we demand the inserted element then add its common known bits.
       if (DemandedElts[EltIdx]) {
-        computeKnownBits(InVal, Known2, Depth + 1);
+        computeKnownBits(InVal, Known2, AnyToZeroExtLoads, Depth + 1);
         Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
         Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
       }
@@ -2891,33 +2939,37 @@
       // that we don't demand the inserted element.
       APInt VectorElts = DemandedElts & ~(APInt::getOneBitSet(NumElts, EltIdx));
       if (!!VectorElts) {
-        computeKnownBits(InVec, Known2, VectorElts, Depth + 1);
+        computeKnownBits(InVec, Known2, VectorElts, AnyToZeroExtLoads,
+                         Depth + 1);
         Known.One &= Known2.One;
         Known.Zero &= Known2.Zero;
       }
     } else {
       // Unknown element index, so ignore DemandedElts and demand them all.
-      computeKnownBits(InVec, Known, Depth + 1);
-      computeKnownBits(InVal, Known2, Depth + 1);
+      computeKnownBits(InVec, Known, AnyToZeroExtLoads, Depth + 1);
+      computeKnownBits(InVal, Known2, AnyToZeroExtLoads, Depth + 1);
       Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
       Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
     }
     break;
   }
   case ISD::BITREVERSE: {
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     Known.Zero = Known2.Zero.reverseBits();
     Known.One = Known2.One.reverseBits();
     break;
   }
   case ISD::BSWAP: {
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
     Known.Zero = Known2.Zero.byteSwap();
     Known.One = Known2.One.byteSwap();
     break;
   }
   case ISD::ABS: {
-    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
 
     // If the source's MSB is zero then we know the rest of the bits already.
     if (Known2.isNonNegative()) {
@@ -2936,8 +2988,10 @@
     break;
   }
   case ISD::UMIN: {
-    computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
-    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
+    computeKnownBits(Op.getOperand(0), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
+    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
 
     // UMIN - we know that the result will have the maximum of the
     // known zero leading bits of the inputs.
@@ -2950,9 +3004,10 @@
     break;
   }
   case ISD::UMAX: {
-    computeKnownBits(Op.getOperand(0), Known, DemandedElts,
+    computeKnownBits(Op.getOperand(0), Known, DemandedElts, AnyToZeroExtLoads,
+                     Depth + 1);
+    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, AnyToZeroExtLoads,
                      Depth + 1);
-    computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
 
     // UMAX - we know that the result will have the maximum of the
     // known one leading bits of the inputs.
@@ -2996,9 +3051,11 @@
   }
 
   // Fallback - just get the shared known bits of the operands.
-  computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+  computeKnownBits(Op.getOperand(0), Known, DemandedElts, AnyToZeroExtLoads,
+                   Depth + 1);
   if (Known.isUnknown()) break; // Early-out
-  computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
+  computeKnownBits(Op.getOperand(1), Known2, DemandedElts, AnyToZeroExtLoads,
+                   Depth + 1);
   Known.Zero &= Known2.Zero;
   Known.One &= Known2.One;
   break;
@@ -3016,7 +3073,8 @@
   case ISD::INTRINSIC_W_CHAIN:
   case ISD::INTRINSIC_VOID:
     // Allow the target to implement this method for its nodes.
-    TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this, Depth);
+    TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this,
+                                       AnyToZeroExtLoads, Depth);
     break;
   }
 
@@ -3360,7 +3418,7 @@
   if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
     if (CRHS->isAllOnesValue()) {
       KnownBits Known;
-      computeKnownBits(Op.getOperand(0), Known, Depth+1);
+      computeKnownBits(Op.getOperand(0), Known, nullptr, Depth + 1);
 
       // If the input is known to be 0 or 1, the output is 0/-1, which is all
       // sign bits set.
@@ -3385,7 +3443,7 @@
   if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0)))
     if (CLHS->isNullValue()) {
       KnownBits Known;
-      computeKnownBits(Op.getOperand(1), Known, Depth+1);
+      computeKnownBits(Op.getOperand(1), Known, nullptr, Depth + 1);
       // If the input is known to be 0 or 1, the output is 0/-1, which is all
       // sign bits set.
       if ((Known.Zero | 1).isAllOnesValue())
@@ -3551,7 +3609,7 @@
   // Finally, if we can prove that the top bits of the result are 0's or 1's,
   // use this information.
   KnownBits Known;
-  computeKnownBits(Op, Known, DemandedElts, Depth);
+  computeKnownBits(Op, Known, DemandedElts, nullptr, Depth);
   APInt Mask;
   if (Known.isNonNegative()) { // sign bit is 0
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -529,7 +529,7 @@
   if (Depth != 0) {
     // If not at the root, just compute the Known bits to
     // simplify things downstream.
-    TLO.DAG.computeKnownBits(Op, Known, Depth);
+    TLO.DAG.computeKnownBits(Op, Known, nullptr, Depth);
     return false;
   }
   // If this is the root being simplified, allow it to have multiple uses,
@@ -580,7 +580,7 @@
       SDValue Op0 = Op.getOperand(0);
       KnownBits LHSKnown;
       // Do not increment Depth here; that can cause an infinite loop.
-      TLO.DAG.computeKnownBits(Op0, LHSKnown, Depth);
+      TLO.DAG.computeKnownBits(Op0, LHSKnown, nullptr, Depth);
       // If the LHS already has zeros where RHSC does, this 'and' is dead.
       if ((LHSKnown.Zero & NewMask) == (~RHSC->getAPIntValue() & NewMask))
         return TLO.CombineTo(Op, Op0);
@@ -1204,7 +1204,7 @@
     // If this is a bitcast, let computeKnownBits handle it.  Only do this on a
     // recursive call where Known may be useful to the caller.
     if (Depth > 0) {
-      TLO.DAG.computeKnownBits(Op, Known, Depth);
+      TLO.DAG.computeKnownBits(Op, Known, nullptr, Depth);
       return false;
     }
     break;
@@ -1257,7 +1257,7 @@
   }
   default:
     // Just use computeKnownBits to compute output bits.
-    TLO.DAG.computeKnownBits(Op, Known, Depth);
+    TLO.DAG.computeKnownBits(Op, Known, nullptr, Depth);
     break;
   }
 
@@ -1569,11 +1569,10 @@
 /// Determine which of the bits specified in Mask are known to be either zero or
 /// one and return them in the Known.
-void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
-                                                   KnownBits &Known,
-                                                   const APInt &DemandedElts,
-                                                   const SelectionDAG &DAG,
-                                                   unsigned Depth) const {
+void TargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+    const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+    unsigned Depth) const {
   assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
           Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
           Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
@@ -2638,6 +2637,58 @@
       return N0;
   }
 
+  // Test if both operands are ANDs with the same mask and the non-mask
+  // bits are the same (often all zero). In this case we can drop the ANDs.
+  if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
+      N0.getNode()->hasOneUse() && N1.getNode()->hasOneUse()) {
+    SDValue N0LHS = N0.getOperand(0);
+    SDValue N0RHS = N0.getOperand(1);
+    SDValue N1LHS = N1.getOperand(0);
+    SDValue N1RHS = N1.getOperand(1);
+
+    if (isa<ConstantSDNode>(N0LHS))
+      std::swap(N0LHS, N0RHS);
+    if (isa<ConstantSDNode>(N1LHS))
+      std::swap(N1LHS, N1RHS);
+
+    if (!isa<ConstantSDNode>(N0RHS) || !isa<ConstantSDNode>(N1RHS))
+      return SDValue();
+
+    APInt AndMask = cast<ConstantSDNode>(N0RHS)->getAPIntValue();
+    if (cast<ConstantSDNode>(N1RHS)->getAPIntValue() != AndMask)
+      return SDValue();
+
+    KnownBits KB0, KB1;
+    llvm::SmallPtrSet<LoadSDNode *, 4> AnyToZeroExtLoads;
+    DAG.computeKnownBits(N0LHS, KB0, &AnyToZeroExtLoads);
+    DAG.computeKnownBits(N1LHS, KB1, &AnyToZeroExtLoads);
+
+    // Check that we know something about all the bits.
+    if (!(KB0.Zero | KB0.One | AndMask).isAllOnesValue())
+      return SDValue();
+
+    // All non-mask bits must be the same from N0 and N1.
+    if ((KB0.Zero & ~AndMask) != (KB1.Zero & ~AndMask) ||
+        (KB0.One & ~AndMask) != (KB1.One & ~AndMask))
+      return SDValue();
+
+    // Transform anyext loads -> zeroext loads.
+    for (auto *LD : AnyToZeroExtLoads) {
+      SDValue Load = DAG.getExtLoad(
+          ISD::ZEXTLOAD, SDLoc(LD), LD->getValueType(0), LD->getChain(),
+          LD->getBasePtr(), LD->getPointerInfo(), LD->getMemoryVT(),
+          LD->getAlignment(), LD->getMemOperand()->getFlags(), LD->getAAInfo());
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), Load);
+      if (LD == N0LHS.getNode())
+        N0LHS = Load;
+      if (LD == N1LHS.getNode())
+        N1LHS = Load;
+    }
+
+    return DAG.getSetCC(dl, VT, N0LHS, N1LHS, Cond);
+  }
+
   // Could not fold it.
   return SDValue();
 }
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -250,10 +250,10 @@
   /// Determine which of the bits specified in Mask are known to be either zero
   /// or one and return them in the KnownZero/KnownOne bitsets.
-  void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
-                                     const APInt &DemandedElts,
-                                     const SelectionDAG &DAG,
-                                     unsigned Depth = 0) const override;
+  void computeKnownBitsForTargetNode(
+      const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+      const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+      unsigned Depth = 0) const override;
 
   bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
                                     TargetLoweringOpt &TLO) const override;
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -965,15 +965,18 @@
 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
 /// Mask are known to be either zero or one and return them in Known.
 void AArch64TargetLowering::computeKnownBitsForTargetNode(
-    const SDValue Op, KnownBits &Known,
-    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
+    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+    const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+    unsigned Depth) const {
   switch (Op.getOpcode()) {
   default:
     break;
   case AArch64ISD::CSEL: {
     KnownBits Known2;
-    DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
-    DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
+    DAG.computeKnownBits(Op->getOperand(0), Known, AnyToZeroExtLoads,
+                         Depth + 1);
+    DAG.computeKnownBits(Op->getOperand(1), Known2, AnyToZeroExtLoads,
+                         Depth + 1);
     Known.Zero &= Known2.Zero;
     Known.One &= Known2.One;
     break;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -227,11 +227,10 @@
   /// \brief Determine which of the bits specified in \p Mask are known to be
   /// either zero or one and return them in the \p KnownZero and \p KnownOne
   /// bitsets.
-  void computeKnownBitsForTargetNode(const SDValue Op,
-                                     KnownBits &Known,
-                                     const APInt &DemandedElts,
-                                     const SelectionDAG &DAG,
-                                     unsigned Depth = 0) const override;
+  void computeKnownBitsForTargetNode(
+      const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+      const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+      unsigned Depth = 0) const override;
 
   unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
                                            const APInt &DemandedElts,
                                            const SelectionDAG &DAG,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4128,8 +4128,9 @@
 }
 
 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
-    const SDValue Op, KnownBits &Known,
-    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
+    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+    const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+    unsigned Depth) const {
 
   Known.resetAll(); // Don't know anything.
@@ -4168,8 +4169,10 @@
   case AMDGPUISD::MUL_U24:
   case AMDGPUISD::MUL_I24: {
     KnownBits LHSKnown, RHSKnown;
-    DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
-    DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+    DAG.computeKnownBits(Op.getOperand(0), LHSKnown, AnyToZeroExtLoads,
+                         Depth + 1);
+    DAG.computeKnownBits(Op.getOperand(1), RHSKnown, AnyToZeroExtLoads,
+                         Depth + 1);
 
     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
                       RHSKnown.countMinTrailingZeros();
Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -383,11 +383,11 @@
                                     SDValue &Offset, ISD::MemIndexedMode &AM,
                                     SelectionDAG &DAG) const override;
 
-    void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
-                                       const APInt &DemandedElts,
-                                       const SelectionDAG &DAG,
-                                       unsigned Depth) const override;
-
+    void computeKnownBitsForTargetNode(
+        const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+        const SelectionDAG &DAG,
+        SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+        unsigned Depth) const override;
 
     bool ExpandInlineAsm(CallInst *CI) const override;
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -13381,11 +13381,10 @@
   return true;
 }
 
-void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
-                                                      KnownBits &Known,
-                                                      const APInt &DemandedElts,
-                                                      const SelectionDAG &DAG,
-                                                      unsigned Depth) const {
+void ARMTargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+    const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+    unsigned Depth) const {
   unsigned BitWidth = Known.getBitWidth();
   Known.resetAll();
   switch (Op.getOpcode()) {
@@ -13408,12 +13407,13 @@
     break;
   case ARMISD::CMOV: {
     // Bits are known zero/one if known on the LHS and RHS.
-    DAG.computeKnownBits(Op.getOperand(0), Known, Depth+1);
+    DAG.computeKnownBits(Op.getOperand(0), Known, AnyToZeroExtLoads, Depth + 1);
     if (Known.isUnknown())
       return;
 
     KnownBits KnownRHS;
-    DAG.computeKnownBits(Op.getOperand(1), KnownRHS, Depth+1);
+    DAG.computeKnownBits(Op.getOperand(1), KnownRHS, AnyToZeroExtLoads,
+                         Depth + 1);
     Known.Zero &= KnownRHS.Zero;
     Known.One &= KnownRHS.One;
     return;
@@ -13435,7 +13435,7 @@
   case ARMISD::BFI: {
     // Conservatively, we can recurse down the first operand
     // and just mask out all affected bits.
-    DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
+    DAG.computeKnownBits(Op.getOperand(0), Known, AnyToZeroExtLoads, Depth + 1);
 
     // The operand to BFI is already a mask suitable for removing the bits it
     // sets.
Index: lib/Target/Lanai/LanaiISelLowering.h
===================================================================
--- lib/Target/Lanai/LanaiISelLowering.h
+++ lib/Target/Lanai/LanaiISelLowering.h
@@ -106,10 +106,10 @@
 
   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
-  void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
-                                     const APInt &DemandedElts,
-                                     const SelectionDAG &DAG,
-                                     unsigned Depth = 0) const override;
+  void computeKnownBitsForTargetNode(
+      const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+      const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+      unsigned Depth = 0) const override;
 
 private:
   SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
Index: lib/Target/Lanai/LanaiISelLowering.cpp
===================================================================
--- lib/Target/Lanai/LanaiISelLowering.cpp
+++ lib/Target/Lanai/LanaiISelLowering.cpp
@@ -1503,7 +1503,8 @@
 
 void LanaiTargetLowering::computeKnownBitsForTargetNode(
     const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
-    const SelectionDAG &DAG, unsigned Depth) const {
+    const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+    unsigned Depth) const {
   unsigned BitWidth = Known.getBitWidth();
   switch (Op.getOpcode()) {
   default:
@@ -1514,8 +1515,10 @@
     break;
   case LanaiISD::SELECT_CC:
     KnownBits Known2;
-    DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
-    DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
+    DAG.computeKnownBits(Op->getOperand(0), Known, AnyToZeroExtLoads,
+                         Depth + 1);
+    DAG.computeKnownBits(Op->getOperand(1), Known2, AnyToZeroExtLoads,
+                         Depth + 1);
     Known.Zero &= Known2.Zero;
     Known.One &= Known2.One;
     break;
Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -659,11 +659,11 @@
     unsigned getRegisterByName(const char* RegName, EVT VT,
                                SelectionDAG &DAG) const override;
 
-    void computeKnownBitsForTargetNode(const SDValue Op,
-                                       KnownBits &Known,
-                                       const APInt &DemandedElts,
-                                       const SelectionDAG &DAG,
-                                       unsigned Depth = 0) const override;
+    void computeKnownBitsForTargetNode(
+        const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+        const SelectionDAG &DAG,
+        SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+        unsigned Depth = 0) const override;
 
     unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12934,11 +12934,10 @@
 // Inline Assembly Support
 //===----------------------------------------------------------------------===//
 
-void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
-                                                      KnownBits &Known,
-                                                      const APInt &DemandedElts,
-                                                      const SelectionDAG &DAG,
-                                                      unsigned Depth) const {
+void PPCTargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+    const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+    unsigned Depth) const {
   Known.resetAll();
   switch (Op.getOpcode()) {
   default: break;
Index: lib/Target/Sparc/SparcISelLowering.h
===================================================================
--- lib/Target/Sparc/SparcISelLowering.h
+++ lib/Target/Sparc/SparcISelLowering.h
@@ -65,11 +65,11 @@
   /// computeKnownBitsForTargetNode - Determine which of the bits specified
   /// in Mask are known to be either zero or one and return them in the
   /// KnownZero/KnownOne bitsets.
-  void computeKnownBitsForTargetNode(const SDValue Op,
-                                     KnownBits &Known,
-                                     const APInt &DemandedElts,
-                                     const SelectionDAG &DAG,
-                                     unsigned Depth = 0) const override;
+  void computeKnownBitsForTargetNode(
+      const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+      const SelectionDAG &DAG,
+      SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+      unsigned Depth = 0) const override;
 
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
Index: lib/Target/Sparc/SparcISelLowering.cpp
===================================================================
--- lib/Target/Sparc/SparcISelLowering.cpp
+++ lib/Target/Sparc/SparcISelLowering.cpp
@@ -1883,12 +1883,10 @@
 /// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
 /// be zero. Op is expected to be a target specific node. Used by DAG
 /// combiner.
-void SparcTargetLowering::computeKnownBitsForTargetNode
-                                (const SDValue Op,
-                                 KnownBits &Known,
-                                 const APInt &DemandedElts,
-                                 const SelectionDAG &DAG,
-                                 unsigned Depth) const {
+void SparcTargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+    const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+    unsigned Depth) const {
   KnownBits Known2;
   Known.resetAll();
 
@@ -1897,8 +1895,9 @@
   case SPISD::SELECT_ICC:
   case SPISD::SELECT_XCC:
   case SPISD::SELECT_FCC:
-    DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
-    DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
+    DAG.computeKnownBits(Op.getOperand(1), Known, AnyToZeroExtLoads, Depth + 1);
+    DAG.computeKnownBits(Op.getOperand(0), Known2, AnyToZeroExtLoads,
+                         Depth + 1);
 
     // Only known if known in both the LHS and RHS.
     Known.One &= Known2.One;
Index: lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.h
+++ lib/Target/SystemZ/SystemZISelLowering.h
@@ -492,11 +492,10 @@
   /// Determine which of the bits specified in Mask are known to be either
   /// zero or one and return them in the KnownZero/KnownOne bitsets.
-  void computeKnownBitsForTargetNode(const SDValue Op,
-                                     KnownBits &Known,
-                                     const APInt &DemandedElts,
-                                     const SelectionDAG &DAG,
-                                     unsigned Depth = 0) const override;
+  void computeKnownBitsForTargetNode(
+      const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+      const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+      unsigned Depth = 0) const override;
 
   ISD::NodeType getExtendForAtomicOps() const override {
     return ISD::ANY_EXTEND;
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -5576,20 +5576,20 @@
   return SDValue();
 }
 
-void
-SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
-                                                     KnownBits &Known,
-                                                     const APInt &DemandedElts,
-                                                     const SelectionDAG &DAG,
-                                                     unsigned Depth) const {
+void SystemZTargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+    const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+    unsigned Depth) const {
   unsigned BitWidth = Known.getBitWidth();
 
   Known.resetAll();
   switch (Op.getOpcode()) {
   case SystemZISD::SELECT_CCMASK: {
     KnownBits TrueKnown(BitWidth), FalseKnown(BitWidth);
-    DAG.computeKnownBits(Op.getOperand(0), TrueKnown, Depth + 1);
-    DAG.computeKnownBits(Op.getOperand(1), FalseKnown, Depth + 1);
+    DAG.computeKnownBits(Op.getOperand(0), TrueKnown, AnyToZeroExtLoads,
+                         Depth + 1);
+    DAG.computeKnownBits(Op.getOperand(1), FalseKnown, AnyToZeroExtLoads,
+                         Depth + 1);
     Known.Zero = TrueKnown.Zero & FalseKnown.Zero;
     Known.One = TrueKnown.One & FalseKnown.One;
     break;
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -840,11 +840,11 @@
     /// Determine which of the bits specified in Mask are known to be either
     /// zero or one and return them in the KnownZero/KnownOne bitsets.
-    void computeKnownBitsForTargetNode(const SDValue Op,
-                                       KnownBits &Known,
-                                       const APInt &DemandedElts,
-                                       const SelectionDAG &DAG,
-                                       unsigned Depth = 0) const override;
+    void computeKnownBitsForTargetNode(
+        const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+        const SelectionDAG &DAG,
+        SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+        unsigned Depth = 0) const override;
 
     /// Determine the number of bits in the operation that are sign bits.
     unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -28137,11 +28137,10 @@
   return TLO.CombineTo(Op, NewOp);
 }
 
-void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
-                                                      KnownBits &Known,
-                                                      const APInt &DemandedElts,
-                                                      const SelectionDAG &DAG,
-                                                      unsigned Depth) const {
+void X86TargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+    const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+    unsigned Depth) const {
   unsigned BitWidth = Known.getBitWidth();
   unsigned Opc = Op.getOpcode();
   EVT VT = Op.getValueType();
@@ -28169,7 +28168,7 @@
     EVT SrcVT = Src.getValueType();
     APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
                                             Op.getConstantOperandVal(1));
-    DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
+    DAG.computeKnownBits(Src, Known, DemandedElt, AnyToZeroExtLoads, Depth + 1);
     Known = Known.zextOrTrunc(BitWidth);
     Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
     break;
@@ -28182,7 +28181,8 @@
       break;
     }
 
-    DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+    DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts,
+                         AnyToZeroExtLoads, Depth + 1);
     unsigned ShAmt = ShiftImm->getZExtValue();
     if (Opc == X86ISD::VSHLI) {
       Known.Zero <<= ShAmt;
@@ -28210,18 +28210,20 @@
     Known = KnownBits(InBitWidth);
     APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
-    DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
+    DAG.computeKnownBits(N0, Known, DemandedSrcElts, AnyToZeroExtLoads,
+                         Depth + 1);
     Known = Known.zext(BitWidth);
     Known.Zero.setBitsFrom(InBitWidth);
     break;
   }
   case X86ISD::CMOV: {
-    DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
+    DAG.computeKnownBits(Op.getOperand(1), Known, AnyToZeroExtLoads, Depth + 1);
     // If we don't know any bits, early out.
     if (Known.isUnknown())
       break;
     KnownBits Known2;
-    DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
+    DAG.computeKnownBits(Op.getOperand(0), Known2, AnyToZeroExtLoads,
+                         Depth + 1);
 
     // Only known if known in both the LHS and RHS.
     Known.One &= Known2.One;
Index: lib/Target/X86/X86InstrCompiler.td
===================================================================
--- lib/Target/X86/X86InstrCompiler.td
+++ lib/Target/X86/X86InstrCompiler.td
@@ -1324,9 +1324,9 @@
     return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
 
   KnownBits Known0;
-  CurDAG->computeKnownBits(N->getOperand(0), Known0, 0);
+  CurDAG->computeKnownBits(N->getOperand(0), Known0, nullptr, 0);
   KnownBits Known1;
-  CurDAG->computeKnownBits(N->getOperand(1), Known1, 0);
+  CurDAG->computeKnownBits(N->getOperand(1), Known1, nullptr, 0);
   return (~Known0.Zero & ~Known1.Zero) == 0;
 }]>;
Index: lib/Target/XCore/XCoreISelLowering.h
===================================================================
--- lib/Target/XCore/XCoreISelLowering.h
+++ lib/Target/XCore/XCoreISelLowering.h
@@ -200,11 +200,11 @@
 
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
-    void computeKnownBitsForTargetNode(const SDValue Op,
-                                       KnownBits &Known,
-                                       const APInt &DemandedElts,
-                                       const SelectionDAG &DAG,
-                                       unsigned Depth = 0) const override;
+    void computeKnownBitsForTargetNode(
+        const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+        const SelectionDAG &DAG,
+        SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+        unsigned Depth = 0) const override;
 
     SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                  bool isVarArg,
Index: lib/Target/XCore/XCoreISelLowering.cpp
===================================================================
--- lib/Target/XCore/XCoreISelLowering.cpp
+++ lib/Target/XCore/XCoreISelLowering.cpp
@@ -1819,11 +1819,10 @@
   return SDValue();
 }
 
-void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
-                                                        KnownBits &Known,
-                                                        const APInt &DemandedElts,
-                                                        const SelectionDAG &DAG,
-                                                        unsigned Depth) const {
+void XCoreTargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+    const SelectionDAG &DAG, SmallPtrSetImpl<LoadSDNode *> *AnyToZeroExtLoads,
+    unsigned Depth) const {
   Known.resetAll();
   switch (Op.getOpcode()) {
   default: break;
Index: test/CodeGen/Thumb/setcc_xor.ll
===================================================================
--- test/CodeGen/Thumb/setcc_xor.ll
+++ test/CodeGen/Thumb/setcc_xor.ll
@@ -4,27 +4,20 @@
 define i8 @test1(i8 zeroext %x, i8 zeroext %y) {
 ; CHECK-V6M-LABEL: test1:
-; CHECK-V6M: movs r2, #255
-; CHECK-V6M-NEXT: mov r3, r2
-; CHECK-V6M-NEXT: bics r3, r1
-; CHECK-V6M-NEXT: bics r2, r0
-; CHECK-V6M-NEXT: mvns r0, r0
+; CHECK-V6M: mvns r0, r0
 ; CHECK-V6M-NEXT: mvns r1, r1
-; CHECK-V6M-NEXT: cmp r2, r3
+; CHECK-V6M-NEXT: cmp r0, r1
 ; CHECK-V6M-NEXT: bls .LBB0_2
 ; CHECK-V6M-NEXT: mov r0, r1
 ; CHECK-V6M-NEXT: .LBB0_2:
 ; CHECK-V6M-NEXT: bx lr
 ;
 ; CHECK-V7M-LABEL: test1:
-; CHECK-V7M: mvns r1, r1
-; CHECK-V7M-NEXT: mvns r0, r0
-; CHECK-V7M-NEXT: uxtb r2, r1
-; CHECK-V7M-NEXT: uxtb r3, r0
-; CHECK-V7M-NEXT: cmp r3, r2
+; CHECK-V7M: mvns r2, r0
+; CHECK-V7M-NEXT: mvns r0, r1
+; CHECK-V7M-NEXT: cmp r2, r0
 ; CHECK-V7M-NEXT: it ls
-; CHECK-V7M-NEXT: movls r1, r0
-; CHECK-V7M-NEXT: mov r0, r1
+; CHECK-V7M-NEXT: movls r0, r2
 ; CHECK-V7M-NEXT: bx lr
 entry:
   %nx = xor i8 %x, 255
@@ -36,43 +29,37 @@
 
 define void @test2(i8* %X, i8* %Y) {
 ; CHECK-V6M-LABEL: test2:
-; CHECK-V6M: .save {r4, r5, r7, lr}
-; CHECK-V6M-NEXT: push {r4, r5, r7, lr}
-; CHECK-V6M-NEXT: ldrb r2, [r1]
-; CHECK-V6M-NEXT: movs r4, #255
-; CHECK-V6M-NEXT: mov r5, r4
-; CHECK-V6M-NEXT: bics r5, r2
-; CHECK-V6M-NEXT: ldrb r3, [r0]
-; CHECK-V6M-NEXT: bics r4, r3
-; CHECK-V6M-NEXT: mvns r3, r3
+; CHECK-V6M: .save {r4, lr}
+; CHECK-V6M-NEXT: push {r4, lr}
+; CHECK-V6M-NEXT: ldrb r2, [r0]
 ; CHECK-V6M-NEXT: mvns r2, r2
+; CHECK-V6M-NEXT: ldrb r3, [r1]
+; CHECK-V6M-NEXT: mvns r3, r3
+; CHECK-V6M-NEXT: cmp r2, r3
 ; CHECK-V6M-NEXT: mov r4, r2
+; CHECK-V6M-NEXT: blo .LBB1_2
+; CHECK-V6M-NEXT: mov r4, r3
 ; CHECK-V6M-NEXT: .LBB1_2:
-; CHECK-V6M-NEXT: subs r3, r3, r4
-; CHECK-V6M-NEXT: strb r3, [r0]
-; CHECK-V6M-NEXT: subs r0, r2, r4
+; CHECK-V6M-NEXT: subs r2, r2, r4
+; CHECK-V6M-NEXT: strb r2, [r0]
+; CHECK-V6M-NEXT: subs r0, r3, r4
 ; CHECK-V6M-NEXT: strb r0, [r1]
-; CHECK-V6M-NEXT: pop {r4, r5, r7, pc}
+; CHECK-V6M-NEXT: pop {r4, pc}
 ;
 ; CHECK-V7M-LABEL: test2:
 ; CHECK-V7M: .save {r7, lr}
 ; CHECK-V7M-NEXT: push {r7, lr}
-; CHECK-V7M-NEXT: ldrb r3, [r0]
-; CHECK-V7M-NEXT: ldrb r2, [r1]
-; CHECK-V7M-NEXT: mvns r3, r3
-; CHECK-V7M-NEXT: mvn.w lr, r2
-; CHECK-V7M-NEXT: uxtb r2, r3
-; CHECK-V7M-NEXT: uxtb.w r12, lr
-; CHECK-V7M-NEXT: cmp r2, r12
-; CHECK-V7M-NEXT: mov r2, lr
+; CHECK-V7M-NEXT: ldrb r3, [r1]
+; CHECK-V7M-NEXT: ldrb r2, [r0]
+; CHECK-V7M-NEXT: mvn.w lr, r3
+; CHECK-V7M-NEXT: mvn.w r12, r2
+; CHECK-V7M-NEXT: cmp r12, lr
+; CHECK-V7M-NEXT: mov r3, lr
 ; CHECK-V7M-NEXT: it lo
-; CHECK-V7M-NEXT: movlo r2, r3
-; CHECK-V7M-NEXT: subs r3, r3, r2
-; CHECK-V7M-NEXT: strb r3, [r0]
-; CHECK-V7M-NEXT: sub.w r0, lr, r2
+; CHECK-V7M-NEXT: movlo r3, r12
+; CHECK-V7M-NEXT: sub.w r2, r12, r3
+; CHECK-V7M-NEXT: strb r2, [r0]
+; CHECK-V7M-NEXT: sub.w r0, lr, r3
 ; CHECK-V7M-NEXT: strb r0, [r1]
 ; CHECK-V7M-NEXT: pop {r7, pc}
 entry:
@@ -92,42 +79,31 @@
 
 define void @testloop(i32 %I, i8* nocapture readonly %A, i8* nocapture %B) {
 ; CHECK-V6M-LABEL: testloop:
-; CHECK-V6M: .save {r4, r5, r6, r7, lr}
-; CHECK-V6M-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-V6M-NEXT: .pad #4
-; CHECK-V6M-NEXT: sub sp, #4
+; CHECK-V6M: .save {r4, r5, r6, lr}
+; CHECK-V6M-NEXT: push {r4, r5, r6, lr}
 ; CHECK-V6M-NEXT: cmp r0, #1
 ; CHECK-V6M-NEXT: blt .LBB2_6
 ; CHECK-V6M-NEXT: .LBB2_1:
+; CHECK-V6M-NEXT: ldrb r3, [r1]
+; CHECK-V6M-NEXT: mvns r4, r3
 ; CHECK-V6M-NEXT: ldrb r3, [r1, #2]
-; CHECK-V6M-NEXT: movs r6, #255
-; CHECK-V6M-NEXT: mov r5, r6
-; CHECK-V6M-NEXT: bics r5, r3
-; CHECK-V6M-NEXT: ldrb r4, [r1]
-; CHECK-V6M-NEXT: mov r7, r6
-; CHECK-V6M-NEXT: bics r7, r4
-; CHECK-V6M-NEXT: mvns r4, r4
 ; CHECK-V6M-NEXT: mvns r3, r3
-; CHECK-V6M-NEXT: cmp r7, r5
+; CHECK-V6M-NEXT: cmp r4, r3
 ; CHECK-V6M-NEXT: mov r5, r4
 ; CHECK-V6M-NEXT: blo .LBB2_3
 ; CHECK-V6M-NEXT: mov r5, r3
 ; CHECK-V6M-NEXT: .LBB2_3:
-; CHECK-V6M-NEXT: str r3, [sp]
-; CHECK-V6M-NEXT: uxtb r3, r5
-; CHECK-V6M-NEXT: ldrb r7, [r1, #1]
-; CHECK-V6M-NEXT: bics r6, r7
-; CHECK-V6M-NEXT: mvns r7, r7
-; CHECK-V6M-NEXT: cmp r3, r6
+; CHECK-V6M-NEXT: ldrb r6, [r1, #1]
+; CHECK-V6M-NEXT: mvns r6, r6
+; CHECK-V6M-NEXT: cmp r5, r6
 ; CHECK-V6M-NEXT: blo .LBB2_5
-; CHECK-V6M-NEXT: mov r5, r7
+; CHECK-V6M-NEXT: mov r5, r6
 ; CHECK-V6M-NEXT: .LBB2_5:
 ; CHECK-V6M-NEXT: strb r5, [r2]
-; CHECK-V6M-NEXT: subs r3, r4, r5
-; CHECK-V6M-NEXT: strb r3, [r2, #1]
-; CHECK-V6M-NEXT: subs r3, r7, r5
-; CHECK-V6M-NEXT: strb r3, [r2, #2]
-; CHECK-V6M-NEXT: ldr r3, [sp]
+; CHECK-V6M-NEXT: subs r4, r4, r5
+; CHECK-V6M-NEXT: strb r4, [r2, #1]
+; CHECK-V6M-NEXT: subs r4, r6, r5
+; CHECK-V6M-NEXT: strb r4, [r2, #2]
 ; CHECK-V6M-NEXT: subs r3, r3, r5
 ; CHECK-V6M-NEXT: strb r3, [r2, #3]
 ; CHECK-V6M-NEXT: adds r2, r2, #4
@@ -135,45 +111,40 @@
 ; CHECK-V6M-NEXT: subs r0, r0, #1
 ; CHECK-V6M-NEXT: bne .LBB2_1
 ; CHECK-V6M-NEXT: .LBB2_6:
-; CHECK-V6M-NEXT: add sp, #4
-; CHECK-V6M-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-V6M-NEXT: pop {r4, r5, r6, pc}
 ;
 ; CHECK-V7M-LABEL: testloop:
-; CHECK-V7M: .save {r4, r5, r6, r7, lr}
-; CHECK-V7M-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-V7M: .save {r4, r5, r7, lr}
+; CHECK-V7M-NEXT: push {r4, r5, r7, lr}
 ; CHECK-V7M-NEXT: cmp r0, #1
 ; CHECK-V7M-NEXT: blt .LBB2_2
 ; CHECK-V7M-NEXT: .LBB2_1:
 ; CHECK-V7M-NEXT: ldrb.w lr, [r1]
-; CHECK-V7M-NEXT: ldrb r3, [r1, #2]
+; CHECK-V7M-NEXT: ldrb r4, [r1, #2]
 ; CHECK-V7M-NEXT: ldrb.w r12, [r1, #1]
 ; CHECK-V7M-NEXT: adds r1, #3
-; CHECK-V7M-NEXT: mvn.w r4, lr
-; CHECK-V7M-NEXT: mvns r7, r3
-; CHECK-V7M-NEXT: uxtb r5, r4
-; CHECK-V7M-NEXT: uxtb r6, r7
-; CHECK-V7M-NEXT: cmp r5, r6
-; CHECK-V7M-NEXT: mov r3, r7
-; CHECK-V7M-NEXT: mvn.w r5, r12
+; CHECK-V7M-NEXT: mvn.w r5, lr
+; CHECK-V7M-NEXT: mvn.w lr, r4
+; CHECK-V7M-NEXT: cmp r5, lr
+; CHECK-V7M-NEXT: mov r4, lr
+; CHECK-V7M-NEXT: mvn.w r3, r12
 ; CHECK-V7M-NEXT: it lo
-; CHECK-V7M-NEXT: movlo r3, r4
-; CHECK-V7M-NEXT: uxtb r6, r5
-; CHECK-V7M-NEXT: uxtb.w lr, r3
-; CHECK-V7M-NEXT: cmp lr, r6
+; CHECK-V7M-NEXT: movlo r4, r5
+; CHECK-V7M-NEXT: cmp r4, r3
 ; CHECK-V7M-NEXT: it hs
-; CHECK-V7M-NEXT: movhs r3, r5
+; CHECK-V7M-NEXT: movhs r4, r3
 ; CHECK-V7M-NEXT: subs r0, #1
-; CHECK-V7M-NEXT: sub.w r6, r4, r3
-; CHECK-V7M-NEXT: strb r3, [r2]
-; CHECK-V7M-NEXT: strb r6, [r2, #1]
-; CHECK-V7M-NEXT: sub.w r6, r5, r3
-; CHECK-V7M-NEXT: strb r6, [r2, #2]
-; CHECK-V7M-NEXT: sub.w r3, r7, r3
+; CHECK-V7M-NEXT: sub.w r3, r3, r4
+; CHECK-V7M-NEXT: strb r4, [r2]
+; CHECK-V7M-NEXT: sub.w r5, r5, r4
+; CHECK-V7M-NEXT: strb r5, [r2, #1]
+; CHECK-V7M-NEXT: strb r3, [r2, #2]
+; CHECK-V7M-NEXT: sub.w r3, lr, r4
 ; CHECK-V7M-NEXT: strb r3, [r2, #3]
 ; CHECK-V7M-NEXT: add.w r2, r2, #4
 ; CHECK-V7M-NEXT: bne .LBB2_1
 ; CHECK-V7M-NEXT: .LBB2_2:
-; CHECK-V7M-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-V7M-NEXT: pop {r4, r5, r7, pc}
 entry:
   %cmp74 = icmp sgt i32 %I, 0
   br i1 %cmp74, label %for.body.preheader, label %for.cond.cleanup
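
Note for reviewers (not part of the patch): the contract of the new AnyToZeroExtLoads
parameter, restated as a minimal, hypothetical caller. The helper name
knownBitsWithSpeculatedZExt is illustrative only; the computeKnownBits /
getExtLoad / ReplaceAllUsesOfValueWith sequence mirrors the SimplifySetCC
change in lib/CodeGen/SelectionDAG/TargetLowering.cpp above.

// Sketch of the intended calling convention. Assumes the usual LLVM
// includes (llvm/ADT/SmallPtrSet.h, llvm/CodeGen/SelectionDAG.h).
static void knownBitsWithSpeculatedZExt(SelectionDAG &DAG, SDValue A,
                                        SDValue B, KnownBits &KA,
                                        KnownBits &KB) {
  // Passing a set tells computeKnownBits it may treat anyext loads as if
  // they were zeroext; every load it speculates on is recorded here.
  llvm::SmallPtrSet<LoadSDNode *, 4> SpeculatedLoads;
  DAG.computeKnownBits(A, KA, &SpeculatedLoads);
  DAG.computeKnownBits(B, KB, &SpeculatedLoads);

  // The returned KnownBits are only valid once each recorded anyext load
  // has actually been rewritten as a zeroext load, so do that before
  // consuming KA/KB.
  for (LoadSDNode *LD : SpeculatedLoads) {
    SDValue ZExtLoad = DAG.getExtLoad(
        ISD::ZEXTLOAD, SDLoc(LD), LD->getValueType(0), LD->getChain(),
        LD->getBasePtr(), LD->getPointerInfo(), LD->getMemoryVT(),
        LD->getAlignment(), LD->getMemOperand()->getFlags(), LD->getAAInfo());
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), ZExtLoad);
  }
}

A caller that bails out after calling computeKnownBits but before converting
the loads is safe only if it discards the KnownBits it computed; using them
without the conversion would be unsound. This is why the SimplifySetCC fold
above performs all of its early returns first and rewrites the loads as its
final step.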