Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3842,9 +3842,16 @@
       EVT ExtVT;
       if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
           isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) {
-        // Only add this load if we can make it more narrow.
-        if (ExtVT.bitsLT(Load->getMemoryVT()))
+
+        // ZEXTLOAD is already small enough.
+        if (Load->getExtensionType() == ISD::ZEXTLOAD &&
+            ExtVT.bitsGE(Load->getMemoryVT()))
+          continue;
+
+        // Use LE to convert equal sized loads to zext.
+        if (ExtVT.bitsLE(Load->getMemoryVT()))
           Loads.insert(Load);
+        continue;
       }
       return false;
@@ -3899,11 +3906,13 @@
   if (Loads.size() == 0)
     return false;
 
+  DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
   SDValue MaskOp = N->getOperand(1);
 
   // If it exists, fixup the single node we allow in the tree that needs
   // masking.
   if (FixupNode) {
+    DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
     SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
                               FixupNode->getValueType(0),
                               SDValue(FixupNode, 0), MaskOp);
@@ -3922,6 +3931,7 @@
 
   // Create narrow loads.
   for (auto *Load : Loads) {
+    DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
     SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
                               SDValue(Load, 0), MaskOp);
     DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
Index: test/CodeGen/ARM/and-load-combine.ll
===================================================================
--- test/CodeGen/ARM/and-load-combine.ll
+++ test/CodeGen/ARM/and-load-combine.ll
@@ -852,8 +852,7 @@
 ; ARM:       @ %bb.0: @ %entry
 ; ARM-NEXT:    ldrb r0, [r0]
 ; ARM-NEXT:    uxtb r2, r2
-; ARM-NEXT:    and r0, r0, r1
-; ARM-NEXT:    uxtb r1, r0
+; ARM-NEXT:    and r1, r0, r1
 ; ARM-NEXT:    mov r0, #0
 ; ARM-NEXT:    cmp r1, r2
 ; ARM-NEXT:    movweq r0, #1
@@ -863,8 +862,7 @@
 ; ARMEB:       @ %bb.0: @ %entry
 ; ARMEB-NEXT:    ldrb r0, [r0]
 ; ARMEB-NEXT:    uxtb r2, r2
-; ARMEB-NEXT:    and r0, r0, r1
-; ARMEB-NEXT:    uxtb r1, r0
+; ARMEB-NEXT:    and r1, r0, r1
 ; ARMEB-NEXT:    mov r0, #0
 ; ARMEB-NEXT:    cmp r1, r2
 ; ARMEB-NEXT:    movweq r0, #1
@@ -872,9 +870,8 @@
 ;
 ; THUMB1-LABEL: test6:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    ldrb r0, [r0]
-; THUMB1-NEXT:    ands r0, r1
-; THUMB1-NEXT:    uxtb r3, r0
+; THUMB1-NEXT:    ldrb r3, [r0]
+; THUMB1-NEXT:    ands r3, r1
 ; THUMB1-NEXT:    uxtb r2, r2
 ; THUMB1-NEXT:    movs r0, #1
 ; THUMB1-NEXT:    movs r1, #0
@@ -889,8 +886,7 @@
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    ldrb r0, [r0]
 ; THUMB2-NEXT:    uxtb r2, r2
-; THUMB2-NEXT:    ands r0, r1
-; THUMB2-NEXT:    uxtb r1, r0
+; THUMB2-NEXT:    ands r1, r0
 ; THUMB2-NEXT:    movs r0, #0
 ; THUMB2-NEXT:    cmp r1, r2
 ; THUMB2-NEXT:    it eq
Index: test/CodeGen/X86/pr35763.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/pr35763.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
+
+%struct.S = type <{ i16, i24, [5 x i8], i8, i16, [2 x i8] }>
+
+@z = global { i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] } { i16 -724, i8 94, i8 -18, i8 5, i8 undef, i8 96, i8 104, i8 -24, i8 10, i8 0, [5 x i8] undef }, align 8
+@tf_3_var_136 = global i64 0, align 8
+@.str = private unnamed_addr constant [6 x i8] c"%llu\0A\00", align 1
+
+; CHECK-LABEL: PR35763
+; CHECK: movzwl z(%rip), %eax
+; CHECK: movzwl z+2(%rip), %ecx
+; CHECK: orl %eax, %ecx
+; CHECK: movq %rcx, tf_3_var_136(%rip)
+; CHECK: movl z+6(%rip), %eax
+; CHECK: movzbl z+10(%rip), %ecx
+; CHECK: shlq $32, %rcx
+; CHECK: orq %rax, %rcx
+; CHECK: movabsq $1090921758719, %rax
+; CHECK: andq %rcx, %rax
+; CHECK: movl %eax, z+6(%rip)
+; CHECK: shrq $32, %rax
+; CHECK: movb %al, z+10(%rip)
+; CHECK: retq
+define void @PR35763() {
+entry:
+  %0 = load i16, i16* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 0), align 8
+  %conv = sext i16 %0 to i32
+  %bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 1) to i32*), align 2
+  %bf.clear = and i32 %bf.load, 2097151
+  %bf.cast = zext i32 %bf.clear to i64
+  %conv1 = trunc i64 %bf.cast to i32
+  %or = or i32 %conv, %conv1
+  %conv2 = trunc i32 %or to i16
+  %conv3 = zext i16 %conv2 to i64
+  store i64 %conv3, i64* @tf_3_var_136, align 8
+  %bf.load4 = load i40, i40* bitcast ([5 x i8]* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 2) to i40*), align 2
+  %bf.clear5 = and i40 %bf.load4, -8589869057
+  store i40 %bf.clear5, i40* bitcast ([5 x i8]* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 2) to i40*), align 2
+  ret void
+}