Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -500,6 +500,9 @@
     /// the load would need to be narrowed in order to match.
     bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
                           EVT LoadResultTy, EVT &ExtVT, bool &NarrowLoad);
+    bool SearchForNarrowLoads(SDNode *N, SmallPtrSetImpl<LoadSDNode*> &Loads,
+                              ConstantSDNode *Mask);
+    bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG);
 
     /// Helper function for MergeConsecutiveStores which merges the
     /// component store chains.
@@ -3730,6 +3733,92 @@
   return true;
 }
 
+bool DAGCombiner::SearchForNarrowLoads(SDNode *N,
+                                       SmallPtrSetImpl<LoadSDNode*> &Loads,
+                                       ConstantSDNode *Mask) {
+  // Recursively search the operands, looking for loads which can be
+  // narrowed.
+  for (unsigned i = 0; i < N->getNumOperands(); ++i) {
+    SDNode *Op = N->getOperand(i).getNode();
+
+    // Constants should already be fixed up...
+    if (isa<ConstantSDNode>(Op))
+      continue;
+
+    if (!Op->hasOneUse() || Op->getValueType(0).isVector())
+      return false;
+
+    if (auto *Load = dyn_cast<LoadSDNode>(Op)) {
+      EVT ExtVT;
+      bool NarrowLoad = false;
+      if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT,
+                           NarrowLoad)) {
+        if (!NarrowLoad)
+          return false;
+        Loads.insert(Load);
+        continue;
+      } else
+        return false;
+    }
+
+    switch (Op->getOpcode()) {
+    default:
+      return false;
+    case ISD::OR:
+    case ISD::XOR:
+    case ISD::AND:
+      SearchForNarrowLoads(Op, Loads, Mask);
+      break;
+    }
+  }
+  return true;
+}
+
+bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
+  // The search function will check whether this constant is a suitable mask.
+  auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!Mask)
+    return false;
+
+  // No need to do anything if the AND directly uses a load.
+  if (isa<LoadSDNode>(N->getOperand(0)))
+    return false;
+
+  SmallPtrSet<LoadSDNode*, 8> Loads;
+  if (SearchForNarrowLoads(N, Loads, Mask)) {
+    if (Loads.size() == 0)
+      return false;
+
+    for (auto *Load : Loads) {
+      assert(Load->hasOneUse() && "Can't transform this load");
+      SDNode::use_iterator Use = Load->use_begin();
+      SDNode *User = *Use;
+
+      // Create the AND which will enable the narrowing.
+      SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), N->getValueType(0),
+                                SDValue(Load, 0), N->getOperand(1));
+
+      SDValue OtherOp = Use.getOperandNo() == 0 ? User->getOperand(1) :
+                                                  User->getOperand(0);
+      // Supported opcodes are commutable.
+      SDValue NewUser = DAG.getNode(User->getOpcode(), SDLoc(User),
+                                    User->getValueType(0), And, OtherOp);
+      DAG.ReplaceAllUsesWith(User, NewUser.getNode());
+      DAG.RemoveDeadNode(User);
+
+      // Reduce the load width; this needs to happen once the original user
+      // is removed because the load can only have one use.
+      SDValue NewLoad = ReduceLoadWidth(And.getNode());
+      assert(NewLoad &&
+             "Shouldn't be masking the load if it can't be narrowed");
+      CombineTo(Load, NewLoad, NewLoad.getValue(1));
+    }
+    DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
+    return true;
+  }
+  return false;
+}
+
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -3928,6 +4017,16 @@
     }
   }
 
+  if (Level >= AfterLegalizeTypes) {
+    // Attempt to propagate the AND back up to the leaves which, if they're
+    // loads, can be combined to narrow loads and the AND node can be removed.
+    // Perform after legalization so that extend nodes will already be
+    // combined into the loads.
+    if (BackwardsPropagateMask(N, DAG)) {
+      return SDValue(N, 0);
+    }
+  }
+
   if (SDValue Combined = visitANDLike(N0, N1, N))
     return Combined;
 
Index: test/CodeGen/ARM/and-load-combine.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/and-load-combine.ll
@@ -0,0 +1,199 @@
+; RUN: llc -mtriple=armv7 %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv7eb %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv6m %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s
+
+; CHECK-LABEL: cmp_xor8_short_short
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_xor8_short_short(i16* nocapture readonly %a,
+                                                    i16* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %1 = load i16, i16* %b, align 2
+  %xor2 = xor i16 %1, %0
+  %2 = and i16 %xor2, 255
+  %cmp = icmp eq i16 %2, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_xor8_short_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_xor8_short_int(i16* nocapture readonly %a,
+                                                  i32* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %conv = zext i16 %0 to i32
+  %1 = load i32, i32* %b, align 4
+  %xor = xor i32 %1, %conv
+  %and = and i32 %xor, 255
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_xor8_int_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_xor8_int_int(i32* nocapture readonly %a,
+                                                i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %xor = xor i32 %1, %0
+  %and = and i32 %xor, 255
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_xor16
+; CHECK: ldrh
+; CHECK: ldrh
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_xor16(i32* nocapture readonly %a,
+                                         i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %xor = xor i32 %1, %0
+  %and = and i32 %xor, 65535
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_or8_short_short
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_or8_short_short(i16* nocapture readonly %a,
+                                                   i16* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %1 = load i16, i16* %b, align 2
+  %or2 = or i16 %1, %0
+  %2 = and i16 %or2, 255
+  %cmp = icmp eq i16 %2, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_or8_short_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_or8_short_int(i16* nocapture readonly %a,
+                                                 i32* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %conv = zext i16 %0 to i32
+  %1 = load i32, i32* %b, align 4
+  %or = or i32 %1, %conv
+  %and = and i32 %or, 255
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_or8_int_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_or8_int_int(i32* nocapture readonly %a,
+                                               i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %or = or i32 %1, %0
+  %and = and i32 %or, 255
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_or16
+; CHECK: ldrh
+; CHECK: ldrh
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_or16(i32* nocapture readonly %a,
+                                        i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %or = or i32 %1, %0
+  %and = and i32 %or, 65535
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_and8_short_short
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_and8_short_short(i16* nocapture readonly %a,
+                                                    i16* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %1 = load i16, i16* %b, align 2
+  %and3 = and i16 %0, 255
+  %2 = and i16 %and3, %1
+  %cmp = icmp eq i16 %2, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_and8_short_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_and8_short_int(i16* nocapture readonly %a,
+                                                  i32* nocapture readonly %b) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %1 = load i32, i32* %b, align 4
+  %2 = and i16 %0, 255
+  %and = zext i16 %2 to i32
+  %and1 = and i32 %1, %and
+  %cmp = icmp eq i32 %and1, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_and8_int_int
+; CHECK: ldrb
+; CHECK: ldrb
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_and8_int_int(i32* nocapture readonly %a,
+                                                i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %and = and i32 %0, 255
+  %and1 = and i32 %and, %1
+  %cmp = icmp eq i32 %and1, 0
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: cmp_and16
+; CHECK: ldrh
+; CHECK: ldrh
+; CHECK-NOT: lsl
+; CHECK: bx
+define arm_aapcscc zeroext i1 @cmp_and16(i32* nocapture readonly %a,
+                                         i32* nocapture readonly %b) {
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %and = and i32 %0, 65535
+  %and1 = and i32 %and, %1
+  %cmp = icmp eq i32 %and1, 0
+  ret i1 %cmp
+}
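
Note (not part of the patch): a minimal C++ sketch of the source-level pattern the new combine targets; the function and parameter names below are illustrative only. The tests above check the equivalent IR, where the AND mask propagated back to the leaves lets both loads be selected as ldrb instead of a full-width load plus shift/mask.

// Illustrative sketch only. Only the low byte of the xor is observed, so the
// BackwardsPropagateMask combine lets both i16 loads be narrowed to i8 loads
// (ldrb on ARM), matching the cmp_xor8_short_short test above.
bool cmp_xor8(const unsigned short *a, const unsigned short *b) {
  return ((*a ^ *b) & 0xff) == 0;
}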