Index: lib/Transforms/InstCombine/InstCombine.h
===================================================================
--- lib/Transforms/InstCombine/InstCombine.h
+++ lib/Transforms/InstCombine/InstCombine.h
@@ -169,6 +169,7 @@
                                     Value *B, Value *C);
   Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, Value *A,
                                     Value *B, Value *C);
+  Instruction *FoldEndianIndependentLoad(BinaryOperator &I);
   Instruction *visitOr(BinaryOperator &I);
   Instruction *visitXor(BinaryOperator &I);
   Instruction *visitShl(BinaryOperator &I);
Index: lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Intrinsics.h"
@@ -2028,6 +2029,226 @@
   return nullptr;
 }
 
+/// \brief Match a (possibly shifted) load that is part of an
+/// endian-independent load sequence.
+/// When nothing is known yet, try to discover the root load pointer
+/// (\p LoadPtr) and the data endianness of the load (\p IsLittleEndianLoad).
+/// If the root load pointer is already known (by then, the endianness has to
+/// be known too), ensure that this (possibly shifted) load is consistent with
+/// them.
+static bool MatchShiftedLoad(Value *V, Value *&LoadPtr,
+                             bool &IsLittleEndianLoad, const DataLayout &DL,
+                             SmallVectorImpl<LoadInst *> &ByteLoads) {
+  Value *LoadV = nullptr;
+  ConstantInt *BitOffset = nullptr;
+  bool IsShiftedLoad = true;
+  const size_t ResByteSize = ByteLoads.size();
+
+  // Try to match a shifted load:
+  //   LE: (LoadPtr[I] << (I * 8))
+  //   BE: (LoadPtr[sizeof(EntireLoad) - I - 1] << (I * 8))
+  if (!match(V, m_Shl(m_ZExt(m_Value(LoadV)), m_ConstantInt(BitOffset)))) {
+    // If not, match a non-shifted load.
+    if (match(V, m_ZExt(m_Value(LoadV))))
+      IsShiftedLoad = false;
+    else
+      return false;
+  }
+
+  // If we already found a load pointer, we *have* to already know the
+  // endianness.  If not, we need to discover both the pointer and the
+  // endianness here.
+  bool IsEndiannessKnown = (LoadPtr != nullptr);
+  auto IsLittleEndian = [&](bool LittleEndian) {
+    if (!IsEndiannessKnown) {
+      IsEndiannessKnown = true;
+      IsLittleEndianLoad = LittleEndian;
+    }
+    return IsLittleEndianLoad == LittleEndian;
+  };
+
+  LoadInst *LoadI = dyn_cast<LoadInst>(LoadV);
+  if (!LoadI || !LoadI->isSimple() ||
+      LoadI->getParent() != cast<Instruction>(V)->getParent() ||
+      LoadI->getType() != Type::getInt8Ty(LoadI->getContext()))
+    return false;
+
+  GetElementPtrInst *GEPI =
+      dyn_cast<GetElementPtrInst>(LoadI->getPointerOperand());
+
+  // If we don't have a root load pointer yet, discover it here.
+  if (!LoadPtr)
+    LoadPtr = (GEPI ? GEPI->getPointerOperand() : LoadI->getPointerOperand());
+
+  // If we didn't find a GEP, this is a non-indexed load (LoadPtr[0]).
+  if (!GEPI) {
+    if (LoadPtr == LoadI->getPointerOperand()) {
+      // Match the non-shifted non-indexed LE load.
+      //   LE: (LoadPtr[0])
+      if (!IsShiftedLoad && IsLittleEndian(true))
+        return ByteLoads[0] = LoadI;
+
+      // Match the shifted non-indexed BE load.
+      //   BE: (LoadPtr[0] << ((sizeof(EntireLoad) - 1) * 8))
+      // This is equivalent to (LoadPtr[sizeof(EntireLoad) - I - 1] << (I * 8))
+      // when I == sizeof(EntireLoad) - 1.
+      if (IsShiftedLoad &&
+          BitOffset->getValue().getZExtValue() == (ResByteSize - 1) * 8 &&
+          IsLittleEndian(false)) {
+        return ByteLoads[0] = LoadI;
+      }
+    }
+    return false;
+  }
+
+  const unsigned AS = GEPI->getPointerAddressSpace();
+  const unsigned OffsetSizeInBits = DL.getPointerSizeInBits(AS);
+
+  APInt Offset(OffsetSizeInBits, 0);
+  if (LoadPtr != GEPI->getPointerOperand() ||
+      !GEPI->accumulateConstantOffset(DL, Offset))
+    return false;
+
+  const unsigned OffsetU = Offset.getZExtValue();
+  if (OffsetU >= ResByteSize)
+    return false;
+
+  // Match the non-shifted BE load.
+  //   BE: (LoadPtr[sizeof(EntireLoad) - I - 1])
+  if (!IsShiftedLoad) {
+    if (IsLittleEndian(false)) {
+      if (OffsetU != ResByteSize - 1)
+        return false;
+      return ByteLoads[ResByteSize - 1] = LoadI;
+    }
+    return false;
+  }
+
+  // If IsShiftedLoad, BitOffset has already been matched and is non-null.
+  const unsigned BitOffsetU = BitOffset->getValue().getZExtValue();
+
+  // Match the general case, a shifted indexed load.
+  //   LE: (LoadPtr[I] << (I * 8))
+  if (OffsetU * 8 == BitOffsetU && IsLittleEndian(true))
+    return ByteLoads[OffsetU] = LoadI;
+  //   BE: (LoadPtr[sizeof(EntireLoad) - I - 1] << (I * 8))
+  if ((ResByteSize - OffsetU - 1) * 8 == BitOffsetU && IsLittleEndian(false))
+    return ByteLoads[ResByteSize - OffsetU - 1] = LoadI;
+
+  return false;
+}
+
+/// \brief Replace a bytewise endian-independent load sequence (shl'd and or'd
+/// into a native-endian value) with a single native-endian load, bswap'd if
+/// the load sequence's endianness differs from the target's.
+///
+/// For instance, for a little-endian load on a little-endian target,
+/// given i32 EntireLoad and i8* LoadPtr, fold:
+///   EntireLoad =
+///     (LoadPtr[0] | LoadPtr[1] << 8 | LoadPtr[2] << 16 | LoadPtr[3] << 24)
+/// into:
+///   EntireLoad = *((i32*)LoadPtr)
+Instruction *InstCombiner::FoldEndianIndependentLoad(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+  Type *ResTy = I.getType();
+  const int ResBitSize = ResTy->getPrimitiveSizeInBits();
+  const int ResByteSize = ResBitSize / 8;
+
+  if (!DL || !ResTy->isIntegerTy() || (ResBitSize % 8) || ResBitSize < 16 ||
+      !isPowerOf2_32(ResByteSize))
+    return nullptr;
+
+  // Keep track of the instruction that loads each byte.
+  SmallVector<LoadInst *, 8> ByteLoads(ResByteSize);
+  // The pointer to the data loaded from.
+  Value *LoadPtr = nullptr;
+  // Whether the loaded data is little-endian.
+  bool IsLELoad = true;
+  // Whether the target data layout is little-endian.
+  const bool IsLETarget = DL->isLittleEndian();
+
+  // Try to match the OR operands as (possibly shifted) loads.
+  // Keep track of the OR operands in a worklist.
+  SmallVector<Value *, 8> ByteValueWorklist;
+  ByteValueWorklist.push_back(Op0);
+  ByteValueWorklist.push_back(Op1);
+  for (size_t i = 0; i < ByteValueWorklist.size(); ++i) {
+    Value *V = ByteValueWorklist[i];
+
+    // Try to match a (possibly shifted) load.
+    if (MatchShiftedLoad(V, LoadPtr, IsLELoad, *DL, ByteLoads))
+      continue;
+
+    // Try to match another OR.
+    Value *LHS = nullptr, *RHS = nullptr;
+    if (match(V, m_Or(m_Value(LHS), m_Value(RHS)))) {
+      ByteValueWorklist.push_back(LHS);
+      ByteValueWorklist.push_back(RHS);
+    } else {
+      return nullptr;
+    }
+  }
+
+  std::sort(ByteLoads.begin(), ByteLoads.end());
+
+  // Check that we load all bytes. If we don't, there's one or more nullptr(s)
+  // in ByteLoads, which will have been sorted to the beginning.
+  if (!ByteLoads[0])
+    return nullptr;
+
+  AliasAnalysis *AA = getAnalysisIfAvailable<AliasAnalysis>();
+
+  // Now go back up until we encounter all the loads, checking all stores in
+  // between.  If we made it this far, we have a clean OR-tree representing an
+  // endian-independent load sequence.  We can (somewhat) take our time, but
+  // add a max-scan safeguard just in case.
+  int NumLoadsEncountered = 0;
+  BasicBlock::iterator BBI = &I;
+  for (unsigned MaxScanInsts = ResByteSize * 5 + 50;
+       BBI != I.getParent()->begin() && MaxScanInsts; --MaxScanInsts) {
+    --BBI;
+
+    // If this is a store to a location that may alias the loaded bytes,
+    // bail out.
+    if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+      if (!SI->isSimple() || !AA ||
+          AA->alias(SI->getPointerOperand(), AliasAnalysis::UnknownSize,
+                    LoadPtr, ResByteSize))
+        return nullptr;
+    } else if (BBI->mayWriteToMemory()) {
+      // If this is any other kind of instruction that may write to memory,
+      // bail out.
+      return nullptr;
+    }
+
+    // If we found one of our loads, count it.
+    if (isa<LoadInst>(BBI))
+      if (std::binary_search(ByteLoads.begin(), ByteLoads.end(), &*BBI))
+        ++NumLoadsEncountered;
+
+    // If we found all our loads, no need to continue.
+    if (NumLoadsEncountered == ResByteSize)
+      break;
+  }
+
+  // If we didn't find all our loads, abort.
+  if (NumLoadsEncountered != ResByteSize)
+    return nullptr;
+
+  const unsigned Alignment = ByteLoads[0]->getAlignment();
+  const unsigned AS = cast<PointerType>(LoadPtr->getType())->getAddressSpace();
+  Instruction *FoldedLoad = Builder->CreateAlignedLoad(
+      Builder->CreatePointerCast(LoadPtr, ResTy->getPointerTo(AS)), Alignment);
+
+  // If the target endianness and the loaded data endianness differ, swap the
+  // bytes after doing the target-endian load.
+  if (IsLETarget != IsLELoad) {
+    Type *Tys[] = {FoldedLoad->getType()};
+    Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+    Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys);
+    FoldedLoad = Builder->CreateCall(TheFn, FoldedLoad);
+  }
+  return ReplaceInstUsesWith(I, FoldedLoad);
+}
+
 Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   bool Changed = SimplifyAssociativeOrCommutative(I);
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -2386,6 +2607,10 @@
     }
   }
 
+  if (Instruction *FoldedLoad = FoldEndianIndependentLoad(I)) {
+    return FoldedLoad;
+  }
+
   return Changed ? &I : nullptr;
 }
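
For context, here is a minimal C sketch (not part of the patch; the function name read32_le is only illustrative) of the source-level idiom the new fold targets. At -O1 and above, clang lowers it to exactly the byte-load/shl/or chain that MatchShiftedLoad and FoldEndianIndependentLoad look for, which the tests below exercise directly in IR form:

    /* Illustrative only: a portable little-endian 32-bit read. With this
       patch, InstCombine should collapse the four byte loads into a single
       i32 load (see test_LE_loadLE32 below). */
    #include <stdint.h>

    uint32_t read32_le(const uint8_t *p) {
      return (uint32_t)p[0]
           | ((uint32_t)p[1] << 8)
           | ((uint32_t)p[2] << 16)
           | ((uint32_t)p[3] << 24);
    }
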
Index: test/Transforms/InstCombine/endian-independent-load-BE.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/endian-independent-load-BE.ll
@@ -0,0 +1,167 @@
+; RUN: opt %s -instcombine -S | FileCheck %s
+target datalayout = "E-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK-LABEL: @test_BE_loadBE32
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: ret i32 %2
+define i32 @test_BE_loadBE32(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8* %buffer, i64 1
+  %5 = load i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 16
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8* %buffer, i64 2
+  %10 = load i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 8
+  %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8* %buffer, i64 3
+  %15 = load i8* %14, align 1
+  %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+  ret i32 %17
+}
+
+; CHECK-LABEL: @test_BE_loadBE32_with_addrspace
+; CHECK: %1 = bitcast i8 addrspace(5)* %buffer to i32 addrspace(5)*
+; CHECK: %2 = load i32 addrspace(5)* %1, align 1
+; CHECK: ret i32 %2
+define i32 @test_BE_loadBE32_with_addrspace(i8 addrspace(5)* nocapture readonly %buffer) {
+  %1 = load i8 addrspace(5)* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 1
+  %5 = load i8 addrspace(5)* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 16
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 2
+  %10 = load i8 addrspace(5)* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 8
+  %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 3
+  %15 = load i8 addrspace(5)* %14, align 1
+  %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+  ret i32 %17
+}
+
+; CHECK-LABEL: @test_BE_loadLE32
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: %3 = call i32 @llvm.bswap.i32(i32 %2)
+; CHECK: ret i32 %3
+define i32 @test_BE_loadLE32(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  ret i32 %17
+}
+
+declare void @load_buffer(i8*, i32*)
+
+; CHECK-LABEL: @test_BE_loadBE32_align4
+; CHECK: %buffer_i = alloca i32, align 4
+; CHECK: %buffer = bitcast i32* %buffer_i to i8*
+; CHECK: call void @load_buffer(i8* %buffer, i32* %buffer_int)
+; CHECK: %1 = load i32* %buffer_i, align 4
+; CHECK: ret i32 %1
+define i32 @test_BE_loadBE32_align4(i32* %buffer_int) {
+  %buffer_i = alloca i32, align 4
+  %buffer = bitcast i32* %buffer_i to i8*
+  call void @load_buffer(i8* %buffer, i32* %buffer_int)
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8* %buffer, i64 1
+  %5 = load i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 16
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8* %buffer, i64 2
+  %10 = load i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 8
+  %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8* %buffer, i64 3
+  %15 = load i8* %14, align 1
+  %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+  ret i32 %17
+}
+
+; No change expected
+; CHECK-LABEL: @test_BE_broken_sparse
+define i32 @test_BE_broken_sparse(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8* %buffer, i64 1
+  %5 = load i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 16
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8* %buffer, i64 2
+  %10 = load i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 8
+  %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8* %buffer, i64 4
+; CHECK: %14 = getelementptr inbounds i8* %buffer, i64 4
+  %15 = load i8* %14, align 1
+; CHECK: %15 = load i8* %14, align 1
+  %16 = zext i8 %15 to i32
+; CHECK: %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+; CHECK: %17 = or i32 %13, %16
+  ret i32 %17
+; CHECK: ret i32 %17
+}
+
+; No change expected
+; CHECK-LABEL: @test_BE_broken_shift
+define i32 @test_BE_broken_shift(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8* %buffer, i64 1
+  %5 = load i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 8
+; CHECK: %7 = shl nuw nsw i32 %6, 8
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8* %buffer, i64 2
+  %10 = load i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 16
+; CHECK: %12 = shl nuw nsw i32 %11, 16
+  %13 = or i32 %8, %12
+; CHECK: %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8* %buffer, i64 3
+  %15 = load i8* %14, align 1
+  %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+; CHECK: %17 = or i32 %13, %16
+  ret i32 %17
+; CHECK: ret i32 %17
+}
Index: test/Transforms/InstCombine/endian-independent-load-LE-aliasing.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/endian-independent-load-LE-aliasing.ll
@@ -0,0 +1,155 @@
+; RUN: opt %s -basicaa -instcombine -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK-LABEL: @test_LE_load_noalias_allocastore
+; CHECK: %c0b = load i8* %buffer, align 1
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: %3 = add i32 %2, %c42
+; CHECK: ret i32 %3
+define i32 @test_LE_load_noalias_allocastore(i8* nocapture %buffer) {
+  %locbuf = alloca i32, align 4
+  %c0b = load i8* %buffer, align 1
+  %c0 = zext i8 %c0b to i32
+  store i32 %c0, i32* %locbuf
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %c0l = load i32* %locbuf
+  %c42 = add i32 %c0l, 42
+  store i32 %c42, i32* %locbuf
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  %c42l = load i32* %locbuf
+  %18 = add i32 %17, %c42l
+  ret i32 %18
+}
+
+; CHECK-LABEL: @test_LE_load_noalias_offsetstore
+; CHECK: %buf_off = getelementptr inbounds i8* %buffer, i64 4
+; CHECK: %c0l = load i8* %buf_off, align 1
+; CHECK: %c42 = add i8 %c0l, 42
+; CHECK: store i8 %c42, i8* %buf_off, align 1
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: %c42z = zext i8 %c42 to i32
+; CHECK: %3 = add i32 %2, %c42z
+; CHECK: ret i32 %3
+define i32 @test_LE_load_noalias_offsetstore(i8* nocapture %buffer) {
+  %locbuf = alloca i32, align 4
+  %c0b = load i8* %buffer, align 1
+  %c0 = zext i8 %c0b to i32
+  store i32 %c0, i32* %locbuf
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %buf_off = getelementptr inbounds i8* %buffer, i64 4
+  %c0l = load i8* %buf_off
+  %c42 = add i8 %c0l, 42
+  store i8 %c42, i8* %buf_off
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  %c42l = load i8* %buf_off
+  %c42z = zext i8 %c42l to i32
+  %18 = add i32 %17, %c42z
+  ret i32 %18
+}
+
+; CHECK-LABEL: @test_LE_load_alias_store
+define i32 @test_LE_load_alias_store(i8* nocapture %buffer) {
+  %1 = load i8* %buffer, align 1
+; CHECK: %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+; CHECK: %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+; CHECK: %9 = load i8* %8, align 1
+  %buf_off = getelementptr inbounds i8* %buffer, i64 2
+  %c0l = load i8* %buf_off
+  %c42 = add i8 %c0l, 42
+  store i8 %c42, i8* %buf_off
+; CHECK: store i8 %c42, i8* %buf_off
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+; CHECK: %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  %c42l = load i8* %buf_off
+; CHECK: %c42l = load i8* %buf_off
+  %c42z = zext i8 %c42l to i32
+  %18 = add i32 %17, %c42z
+  ret i32 %18
+; CHECK: ret i32 %18
+}
+
+; CHECK-LABEL: @test_LE_load_mayalias_store
+define i32 @test_LE_load_mayalias_store(i8* nocapture %buffer, i8* nocapture %othermem) {
+  %1 = load i8* %buffer, align 1
+; CHECK: %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+; CHECK: %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+; CHECK: %9 = load i8* %8, align 1
+  %buf_off = getelementptr inbounds i8* %othermem, i64 2
+  %c0l = load i8* %buf_off
+; CHECK: %c0l = load i8* %buf_off
+  %c42 = add i8 %c0l, 42
+  store i8 %c42, i8* %buf_off
+; CHECK: store i8 %c42, i8* %buf_off
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+; CHECK: %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+; CHECK: %17 = or i32 %12, %16
+  %c42l = load i8* %buf_off
+; CHECK: %c42l = load i8* %buf_off
+  %c42z = zext i8 %c42l to i32
+  %18 = add i32 %17, %c42z
+  ret i32 %18
+; CHECK: ret i32 %18
+}
Index: test/Transforms/InstCombine/endian-independent-load-LE.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/endian-independent-load-LE.ll
@@ -0,0 +1,262 @@
+; RUN: opt %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK-LABEL: @test_LE_loadLE32
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: ret i32 %2
+define i32 @test_LE_loadLE32(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  ret i32 %17
+}
+
+; CHECK-LABEL: @test_LE_loadLE64
+; CHECK: %1 = bitcast i8* %buffer to i64*
+; CHECK: %2 = load i64* %1, align 1
+; CHECK: ret i64 %2
+define i64 @test_LE_loadLE64(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i64
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i64
+  %6 = shl nuw nsw i64 %5, 8
+  %7 = or i64 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i64
+  %11 = shl nuw nsw i64 %10, 16
+  %12 = or i64 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i64
+  %16 = shl nuw nsw i64 %15, 24
+  %17 = or i64 %12, %16
+  %18 = getelementptr inbounds i8* %buffer, i64 4
+  %19 = load i8* %18, align 1
+  %20 = zext i8 %19 to i64
+  %21 = shl nuw nsw i64 %20, 32
+  %22 = or i64 %17, %21
+  %23 = getelementptr inbounds i8* %buffer, i64 5
+  %24 = load i8* %23, align 1
+  %25 = zext i8 %24 to i64
+  %26 = shl nuw nsw i64 %25, 40
+  %27 = or i64 %22, %26
+  %28 = getelementptr inbounds i8* %buffer, i64 6
+  %29 = load i8* %28, align 1
+  %30 = zext i8 %29 to i64
+  %31 = shl nuw nsw i64 %30, 48
+  %32 = or i64 %27, %31
+  %33 = getelementptr inbounds i8* %buffer, i64 7
+  %34 = load i8* %33, align 1
+  %35 = zext i8 %34 to i64
+  %36 = shl nuw i64 %35, 56
+  %37 = or i64 %32, %36
+  ret i64 %37
+}
+
+; CHECK-LABEL: @test_LE_loadLE32_permuted
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: ret i32 %2
+define i32 @test_LE_loadLE32_permuted(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = getelementptr inbounds i8* %buffer, i64 2
+  %8 = load i8* %7, align 1
+  %9 = zext i8 %8 to i32
+  %10 = shl nuw nsw i32 %9, 16
+  %11 = getelementptr inbounds i8* %buffer, i64 3
+  %12 = load i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw i32 %13, 24
+  %15 = or i32 %6, %2
+  %16 = or i32 %14, %10
+  %17 = or i32 %16, %15
+  ret i32 %17
+}
+
+; CHECK-LABEL: @test_LE_loadLE32_with_addrspace
+; CHECK: %1 = bitcast i8 addrspace(5)* %buffer to i32 addrspace(5)*
+; CHECK: %2 = load i32 addrspace(5)* %1, align 1
+; CHECK: ret i32 %2
+define i32 @test_LE_loadLE32_with_addrspace(i8 addrspace(5)* nocapture readonly %buffer) {
+  %1 = load i8 addrspace(5)* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 1
+  %4 = load i8 addrspace(5)* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 2
+  %9 = load i8 addrspace(5)* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 3
+  %14 = load i8 addrspace(5)* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  ret i32 %17
+}
+
+; CHECK-LABEL: @test_LE_loadBE32
+; CHECK: %1 = bitcast i8* %buffer to i32*
+; CHECK: %2 = load i32* %1, align 1
+; CHECK: %3 = call i32 @llvm.bswap.i32(i32 %2)
+; CHECK: ret i32 %3
+define i32 @test_LE_loadBE32(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = shl nuw i32 %2, 24
+  %4 = getelementptr inbounds i8* %buffer, i64 1
+  %5 = load i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 16
+  %8 = or i32 %7, %3
+  %9 = getelementptr inbounds i8* %buffer, i64 2
+  %10 = load i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 8
+  %13 = or i32 %8, %12
+  %14 = getelementptr inbounds i8* %buffer, i64 3
+  %15 = load i8* %14, align 1
+  %16 = zext i8 %15 to i32
+  %17 = or i32 %13, %16
+  ret i32 %17
+}
+
+declare void @load_buffer(i8*, i32*)
+
+; CHECK-LABEL: @test_LE_loadLE32_align4
+; CHECK: %buffer_i = alloca i32, align 4
+; CHECK: %buffer = bitcast i32* %buffer_i to i8*
+; CHECK: call void @load_buffer(i8* %buffer, i32* %buffer_int)
+; CHECK: %1 = load i32* %buffer_i, align 4
+; CHECK: ret i32 %1
+define i32 @test_LE_loadLE32_align4(i32* %buffer_int) {
+  %buffer_i = alloca i32, align 4
+  %buffer = bitcast i32* %buffer_i to i8*
+  call void @load_buffer(i8* %buffer, i32* %buffer_int)
+  %1 = load i8* %buffer, align 4
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+  ret i32 %17
+}
+
+; No change expected
+; CHECK-LABEL: @test_LE_broken_sparse
+define i32 @test_LE_broken_sparse(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 4
+; CHECK: %13 = getelementptr inbounds i8* %buffer, i64 4
+  %14 = load i8* %13, align 1
+; CHECK: %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+; CHECK: %15 = zext i8 %14 to i32
+  %16 = shl nuw i32 %15, 24
+; CHECK: %16 = shl nuw i32 %15, 24
+  %17 = or i32 %12, %16
+; CHECK: %17 = or i32 %12, %16
+  ret i32 %17
+; CHECK: ret i32 %17
+}
+
+; No change expected
+; CHECK-LABEL: @test_LE_broken_shift
+define i32 @test_LE_broken_shift(i8* nocapture readonly %buffer) {
+  %1 = load i8* %buffer, align 1
+  %2 = zext i8 %1 to i32
+  %3 = getelementptr inbounds i8* %buffer, i64 1
+  %4 = load i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 8
+  %7 = or i32 %6, %2
+  %8 = getelementptr inbounds i8* %buffer, i64 2
+  %9 = load i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 24
+; CHECK: %11 = shl nuw nsw i32 %10, 24
+  %12 = or i32 %7, %11
+  %13 = getelementptr inbounds i8* %buffer, i64 3
+  %14 = load i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw nsw i32 %15, 16
+; CHECK: %16 = shl nuw nsw i32 %15, 16
+  %17 = or i32 %12, %16
+; CHECK: %17 = or i32 %12, %16
+  ret i32 %17
+; CHECK: ret i32 %17
+}
+
+; No change expected
+; CHECK-LABEL: @test_LE_broken_load
+define i32 @test_LE_broken_load(i8** %bits) {
+  %bits_ptr = load i8** %bits, align 8
+; CHECK: %bits_ptr = load i8** %bits, align 8
+  %1 = load i8* %bits_ptr, align 1
+  %conv = zext i8 %1 to i32
+  %shl = shl nuw nsw i32 %conv, 16
+  %arrayidx2 = getelementptr inbounds i8* %bits_ptr, i64 1
+  %2 = load i8* %arrayidx2, align 1
+  %conv3 = zext i8 %2 to i32
+  %shl4 = shl nuw nsw i32 %conv3, 8
+  %or = or i32 %shl, %shl4
+  %arrayidx6 = getelementptr inbounds i8* %bits_ptr, i64 2
+; CHECK: %arrayidx6 = getelementptr inbounds i8* %bits_ptr, i64 2
+  %3 = load i8* %arrayidx6, align 1
+; CHECK: %3 = load i8* %arrayidx6, align 1
+  %conv7 = zext i8 %3 to i32
+; CHECK: %conv7 = zext i8 %3 to i32
+  %or8 = or i32 %or, %conv7
+; CHECK: %or8 = or i32 %or, %conv7
+  ret i32 %or8
+; CHECK: ret i32 %or8
+}
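
To round out the picture, here is a matching C sketch of the big-endian read idiom (again illustrative only; read32_be is not a name used by the patch). On a little-endian target such as the x86_64 triple used in these tests, the expected result of the fold is a single i32 load followed by a call to llvm.bswap.i32, which is what test_LE_loadBE32 above checks for:

    /* Illustrative only: a portable big-endian 32-bit read. On a
       little-endian target this should become one i32 load plus a bswap. */
    #include <stdint.h>

    uint32_t read32_be(const uint8_t *p) {
      return ((uint32_t)p[0] << 24)
           | ((uint32_t)p[1] << 16)
           | ((uint32_t)p[2] << 8)
           |  (uint32_t)p[3];
    }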