Index: llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -681,6 +681,34 @@
   return nullptr;
 }
 
+static Instruction *narrowLoad(TruncInst &Trunc,
+                               InstCombiner::BuilderTy &Builder,
+                               const DataLayout &DL) {
+  // Check the layout to ensure we are not creating an unsupported operation.
+  // TODO: Create a GEP to offset the load?
+  if (!DL.isLittleEndian())
+    return nullptr;
+  unsigned NarrowBitWidth = Trunc.getDestTy()->getPrimitiveSizeInBits();
+  if (!DL.isLegalInteger(NarrowBitWidth))
+    return nullptr;
+
+  // Match a truncated load with no other uses.
+  Value *X;
+  if (!match(Trunc.getOperand(0), m_OneUse(m_Load(m_Value(X)))))
+    return nullptr;
+  LoadInst *WideLoad = cast<LoadInst>(Trunc.getOperand(0));
+  if (!WideLoad->isSimple())
+    return nullptr;
+
+  // trunc (load X) --> load (bitcast X)
+  PointerType *PtrTy = PointerType::get(Trunc.getDestTy(),
+                                        WideLoad->getPointerAddressSpace());
+  Value *Bitcast = Builder.CreateBitCast(X, PtrTy);
+  LoadInst *NarrowLoad = new LoadInst(Bitcast);
+  NarrowLoad->setAlignment(WideLoad->getAlignment());
+  return NarrowLoad;
+}
+
 Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   if (Instruction *Result = commonCastTransforms(CI))
     return Result;
@@ -840,6 +868,9 @@
   if (Instruction *I = foldVecTruncToExtElt(CI, *this))
     return I;
 
+  if (Instruction *NewLoad = narrowLoad(CI, Builder, DL))
+    return NewLoad;
+
   return nullptr;
 }
 
Index: llvm/test/Transforms/InstCombine/trunc-load.ll
===================================================================
--- llvm/test/Transforms/InstCombine/trunc-load.ll
+++ llvm/test/Transforms/InstCombine/trunc-load.ll
@@ -2,28 +2,44 @@
 ; RUN: opt < %s -instcombine -S -data-layout="e-n16:32:64" | FileCheck %s --check-prefixes=CHECK,LE
 ; RUN: opt < %s -instcombine -S -data-layout="E-n16:32:64" | FileCheck %s --check-prefixes=CHECK,BE
 
+; On little-endian, we can narrow the load without an offset.
+
 define i32 @truncload(i64* %ptr) {
-; CHECK-LABEL: @truncload(
-; CHECK-NEXT:    [[X:%.*]] = load i64, i64* [[PTR:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = trunc i64 [[X]] to i32
-; CHECK-NEXT:    ret i32 [[R]]
+; LE-LABEL: @truncload(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast i64* [[PTR:%.*]] to i32*
+; LE-NEXT:    [[R:%.*]] = load i32, i32* [[TMP1]], align 4
+; LE-NEXT:    ret i32 [[R]]
+;
+; BE-LABEL: @truncload(
+; BE-NEXT:    [[X:%.*]] = load i64, i64* [[PTR:%.*]], align 4
+; BE-NEXT:    [[R:%.*]] = trunc i64 [[X]] to i32
+; BE-NEXT:    ret i32 [[R]]
 ;
   %x = load i64, i64* %ptr
   %r = trunc i64 %x to i32
   ret i32 %r
 }
 
+; Preserve alignment.
+
 define i16 @truncload_align(i32* %ptr) {
-; CHECK-LABEL: @truncload_align(
-; CHECK-NEXT:    [[X:%.*]] = load i32, i32* [[PTR:%.*]], align 16
-; CHECK-NEXT:    [[R:%.*]] = trunc i32 [[X]] to i16
-; CHECK-NEXT:    ret i16 [[R]]
+; LE-LABEL: @truncload_align(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast i32* [[PTR:%.*]] to i16*
+; LE-NEXT:    [[R:%.*]] = load i16, i16* [[TMP1]], align 16
+; LE-NEXT:    ret i16 [[R]]
+;
+; BE-LABEL: @truncload_align(
+; BE-NEXT:    [[X:%.*]] = load i32, i32* [[PTR:%.*]], align 16
+; BE-NEXT:    [[R:%.*]] = trunc i32 [[X]] to i16
+; BE-NEXT:    ret i16 [[R]]
 ;
   %x = load i32, i32* %ptr, align 16
   %r = trunc i32 %x to i16
   ret i16 %r
 }
 
+; Negative test - extra use means we would not eliminate the original load.
+
 declare void @use(i64)
 
 define i32 @truncload_extra_use(i64* %ptr) {
@@ -39,6 +55,8 @@
   ret i32 %r
 }
 
+; Negative test - don't create a load if the type is not allowed by the data-layout.
+
 define i8 @truncload_type(i64* %ptr) {
 ; CHECK-LABEL: @truncload_type(
 ; CHECK-NEXT:    [[X:%.*]] = load i64, i64* [[PTR:%.*]], align 2
@@ -50,6 +68,8 @@
   ret i8 %r
 }
 
+; Negative test - don't transform volatiles.
+
 define i32 @truncload_volatile(i64* %ptr) {
 ; CHECK-LABEL: @truncload_volatile(
 ; CHECK-NEXT:    [[X:%.*]] = load volatile i64, i64* [[PTR:%.*]], align 8
@@ -61,11 +81,18 @@
   ret i32 %r
 }
 
+; Preserve address space.
+
 define i32 @truncload_address_space(i64 addrspace(1)* %ptr) {
-; CHECK-LABEL: @truncload_address_space(
-; CHECK-NEXT:    [[X:%.*]] = load i64, i64 addrspace(1)* [[PTR:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = trunc i64 [[X]] to i32
-; CHECK-NEXT:    ret i32 [[R]]
+; LE-LABEL: @truncload_address_space(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast i64 addrspace(1)* [[PTR:%.*]] to i32 addrspace(1)*
+; LE-NEXT:    [[R:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4
+; LE-NEXT:    ret i32 [[R]]
+;
+; BE-LABEL: @truncload_address_space(
+; BE-NEXT:    [[X:%.*]] = load i64, i64 addrspace(1)* [[PTR:%.*]], align 4
+; BE-NEXT:    [[R:%.*]] = trunc i64 [[X]] to i32
+; BE-NEXT:    ret i32 [[R]]
 ;
   %x = load i64, i64 addrspace(1)* %ptr, align 4
   %r = trunc i64 %x to i32