diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12214,7 +12214,8 @@
     // accessing any of the loaded bytes. If the load was a zextload/extload
     // then the result of the shift+trunc is zero/undef (handled elsewhere).
     ShAmt = SRL1C->getZExtValue();
-    if (ShAmt >= LN->getMemoryVT().getSizeInBits())
+    uint64_t MemoryWidth = LN->getMemoryVT().getSizeInBits();
+    if (ShAmt >= MemoryWidth)
       return SDValue();
 
     // Because a SRL must be assumed to *need* to zero-extend the high bits
@@ -12223,13 +12224,19 @@
     if (LN->getExtensionType() == ISD::SEXTLOAD)
       return SDValue();
 
-    unsigned ExtVTBits = ExtVT.getScalarSizeInBits();
-    // Is the shift amount a multiple of size of ExtVT?
-    if ((ShAmt & (ExtVTBits - 1)) != 0)
-      return SDValue();
-    // Is the load width a multiple of size of ExtVT?
-    if ((SRL.getScalarValueSizeInBits() & (ExtVTBits - 1)) != 0)
-      return SDValue();
+    // Avoid reading outside the memory accessed by the original load (could
+    // happen if we only adjust the load base pointer by ShAmt). Instead we
+    // try to narrow the load even further. The typical scenario here is:
+    //   (i64 (truncate (i96 (srl (load x), 64)))) ->
+    //     (i64 (truncate (i96 (zextload (load i32 + offset) from i32))))
+    if (ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt) {
+      // Don't replace sextload by zextload.
+      if (ExtType == ISD::SEXTLOAD)
+        return SDValue();
+      // Narrow the load.
+      ExtType = ISD::ZEXTLOAD;
+      ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
+    }
 
     // If the SRL is only used by a masking AND, we may be able to adjust
     // the ExtVT to make the AND redundant.
@@ -12241,7 +12248,7 @@
         EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
                                          ShiftMask.countTrailingOnes());
         // If the mask is smaller, recompute the type.
-        if ((ExtVTBits > MaskedVT.getScalarSizeInBits()) &&
+        if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
             TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
           ExtVT = MaskedVT;
       }
diff --git a/llvm/test/CodeGen/ARM/shift-combine.ll b/llvm/test/CodeGen/ARM/shift-combine.ll
--- a/llvm/test/CodeGen/ARM/shift-combine.ll
+++ b/llvm/test/CodeGen/ARM/shift-combine.ll
@@ -302,9 +302,7 @@
 ;
 ; CHECK-BE-LABEL: test_lshr_load64_4_unaligned:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldr r1, [r0]
-; CHECK-BE-NEXT:    ldrh r0, [r0, #4]
-; CHECK-BE-NEXT:    orr r0, r0, r1, lsl #16
+; CHECK-BE-NEXT:    ldr r0, [r0, #2]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load64_4_unaligned:
@@ -341,9 +339,7 @@
 ;
 ; CHECK-BE-LABEL: test_lshr_load64_1_lsb:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldr r1, [r0]
-; CHECK-BE-NEXT:    ldrb r0, [r0, #4]
-; CHECK-BE-NEXT:    orr r0, r0, r1, lsl #8
+; CHECK-BE-NEXT:    ldr r0, [r0, #1]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load64_1_lsb:
@@ -441,23 +437,17 @@
 define arm_aapcscc i32 @test_lshr_load4_fail(i64* %a) {
 ; CHECK-ARM-LABEL: test_lshr_load4_fail:
 ; CHECK-ARM:       @ %bb.0: @ %entry
-; CHECK-ARM-NEXT:    ldrd r0, r1, [r0]
-; CHECK-ARM-NEXT:    lsr r0, r0, #8
-; CHECK-ARM-NEXT:    orr r0, r0, r1, lsl #24
+; CHECK-ARM-NEXT:    ldr r0, [r0, #1]
 ; CHECK-ARM-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: test_lshr_load4_fail:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    ldrd r0, r1, [r0]
-; CHECK-BE-NEXT:    lsr r1, r1, #8
-; CHECK-BE-NEXT:    orr r0, r1, r0, lsl #24
+; CHECK-BE-NEXT:    ldr r0, [r0, #3]
 ; CHECK-BE-NEXT:    bx lr
 ;
 ; CHECK-THUMB-LABEL: test_lshr_load4_fail:
 ; CHECK-THUMB:       @ %bb.0: @ %entry
-; CHECK-THUMB-NEXT:    ldrd r0, r1, [r0]
-; CHECK-THUMB-NEXT:    lsrs r0, r0, #8
-; CHECK-THUMB-NEXT:    orr.w r0, r0, r1, lsl #24
+; CHECK-THUMB-NEXT:    ldr.w r0, [r0, #1]
 ; CHECK-THUMB-NEXT:    bx lr
 ;
 ; CHECK-ALIGN-LABEL: test_lshr_load4_fail:
diff --git a/llvm/test/CodeGen/X86/shift-folding.ll b/llvm/test/CodeGen/X86/shift-folding.ll
--- a/llvm/test/CodeGen/X86/shift-folding.ll
+++ b/llvm/test/CodeGen/X86/shift-folding.ll
@@ -88,9 +88,7 @@
 ; CHECK-LABEL: srl_load_narrowing1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl (%eax), %eax
-; CHECK-NEXT:    shrl $8, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    movzwl 1(%eax), %eax
 ; CHECK-NEXT:    retl
   %tmp1 = load i32, i32* %arg, align 1
   %tmp2 = lshr i32 %tmp1, 8
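
For illustration only (this note and test are not part of the patch): the
scenario in the new DAGCombiner comment corresponds to IR like the sketch
below. The function name is hypothetical; the types mirror the comment's own
i96/i64 example.

; Before this change the combine gave up on this pattern because the 96-bit
; load width is not a multiple of the 64-bit result width. With the change it
; narrows the access instead: ExtVT becomes i32 (the MemoryWidth - ShAmt = 32
; bits that remain valid after the shift), and the load is rebuilt as a
; zextload of those four bytes, at byte offset 8 on a little-endian target.
define i64 @lshr_trunc_i96_load(i96* %p) {
  %val = load i96, i96* %p, align 1
  %shifted = lshr i96 %val, 64
  %res = trunc i96 %shifted to i64
  ret i64 %res
}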