This is an archive of the discontinued LLVM Phabricator instance.

[InstCombine] Fold endian-independent load sequence into a single load.
Changes PlannedPublic

Authored by ab on Oct 21 2014, 5:21 PM.

Download Raw Diff

Details

Reviewers

majnemer
hfinkel

Summary

This patch combines endian-independent load sequences that look like this:

(x[3]<<24 | x[2]<<16 | x[1]<<8 | x[0])

into a single load (bswapped if the data endianness is different from the target endianness):

*((int*)x)

One notable issue is alignment: this patch produces 1-aligned loads, no matter the size. In practice, this means that on some targets (ARM comes to mind), the load will codegen into the same original shift/or sequence. I'm not sure if there's a way to discover alignment for this sort of situation. But no matter what, this should always be profitable.

It doesn't happen very often in practice (LNT is still running), so I tried to avoid being too expensive (this is pretty different from the other OR combines).

Also, the combining is aborted if there are *any* stores between the first and last load in the sequence. This needs a loop scanning up from the last instruction. As a safeguard, I put a max number of instructions on that loop, but by then, we already know the combine is valid, so I'm not sure if it's a good idea to abort that late.

Thanks!

Ahmed

Diff Detail

Event Timeline

ab updated this revision to Diff 15220.Oct 21 2014, 5:21 PM

ab retitled this revision from to [InstCombine] Fold endian-independent load sequence into a single load..

ab updated this object.

ab edited the test plan for this revision. (Show Details)

ab added a reviewer: majnemer.

ab added a subscriber: Unknown Object (MLST).

Herald added a subscriber: aemerson. · View Herald TranscriptOct 21 2014, 5:21 PM

hfinkel added a subscriber: hfinkel.Oct 22 2014, 8:19 AM

hfinkel added inline comments.

lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
2069	As you require these loads to all be in the same block and the initial instruction when you scan later, why don't you get the load's parent block here and bail out earlier?
2211	Okay, but why not be smart? Making an AA query is easy.

Address Hal's comments:

- bail out early if the byte load instruction isn't in the same block as the shift
- query AA instead of bailing out for all stores

Also, test that the unaligned folded load is made to be aligned (this is done by other InstCombines, but it's pretty important for this; I can remove the test if desired)

Keep the alignment from the first byte load instruction.
Also, refactor the ByteLoads handling to make it endian-independent (from a memory standpoint).

The previous patch relied on the later InstCombines to set the alignment, but we can also do it here. I didn't do that from the beginning because I couldn't think of a case where that's possible but the later InstCombine isn't. I still can't, but that's no reason not to do it here!

ab planned changes to this revision.Nov 13 2014, 9:09 AM

Revision Contents

Path

Size

		lib/	Transforms/	InstCombine/
	c/	lib/	Transforms/	InstCombine/

InstCombine.h

1 line

InstCombineAndOrXor.cpp

225 lines

test/

Transforms/

InstCombine/

endian-independent-load-BE.ll

167 lines

endian-independent-load-LE-aliasing.ll

155 lines

endian-independent-load-LE.ll

262 lines

Diff 15266

lib/Transforms/InstCombine/InstCombine.h

Show First 20 Lines • Show All 163 Lines • ▼ Show 20 Lines	public:
Value FoldAndOfFCmps(FCmpInst LHS, FCmpInst *RHS);		Value FoldAndOfFCmps(FCmpInst LHS, FCmpInst *RHS);
Instruction *visitAnd(BinaryOperator &I);		Instruction *visitAnd(BinaryOperator &I);
Value FoldOrOfICmps(ICmpInst LHS, ICmpInst RHS, Instruction CxtI);		Value FoldOrOfICmps(ICmpInst LHS, ICmpInst RHS, Instruction CxtI);
Value FoldOrOfFCmps(FCmpInst LHS, FCmpInst *RHS);		Value FoldOrOfFCmps(FCmpInst LHS, FCmpInst *RHS);
Instruction FoldOrWithConstants(BinaryOperator &I, Value Op, Value *A,		Instruction FoldOrWithConstants(BinaryOperator &I, Value Op, Value *A,
Value B, Value C);		Value B, Value C);
Instruction FoldXorWithConstants(BinaryOperator &I, Value Op, Value *A,		Instruction FoldXorWithConstants(BinaryOperator &I, Value Op, Value *A,
Value B, Value C);		Value B, Value C);
		Instruction *FoldEndianIndependentLoad(BinaryOperator &I);
Instruction *visitOr(BinaryOperator &I);		Instruction *visitOr(BinaryOperator &I);
Instruction *visitXor(BinaryOperator &I);		Instruction *visitXor(BinaryOperator &I);
Instruction *visitShl(BinaryOperator &I);		Instruction *visitShl(BinaryOperator &I);
Instruction *visitAShr(BinaryOperator &I);		Instruction *visitAShr(BinaryOperator &I);
Instruction *visitLShr(BinaryOperator &I);		Instruction *visitLShr(BinaryOperator &I);
Instruction *commonShiftTransforms(BinaryOperator &I);		Instruction *commonShiftTransforms(BinaryOperator &I);
Instruction FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction LHSI,		Instruction FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction LHSI,
Constant *RHSC);		Constant *RHSC);
▲ Show 20 Lines • Show All 253 Lines • Show Last 20 Lines

lib/Transforms/InstCombine/InstCombineAndOrXor.cpp

//===- InstCombineAndOrXor.cpp --------------------------------------------===//		//===- InstCombineAndOrXor.cpp --------------------------------------------===//
//		//
// The LLVM Compiler Infrastructure		// The LLVM Compiler Infrastructure
//		//
// This file is distributed under the University of Illinois Open Source		// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.		// License. See LICENSE.TXT for details.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file implements the visitAnd, visitOr, and visitXor functions.		// This file implements the visitAnd, visitOr, and visitXor functions.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "InstCombine.h"		#include "InstCombine.h"
		#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"		#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/ConstantRange.h"		#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Intrinsics.h"		#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"		#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/CmpInstAnalysis.h"		#include "llvm/Transforms/Utils/CmpInstAnalysis.h"
using namespace llvm;		using namespace llvm;
using namespace PatternMatch;		using namespace PatternMatch;

▲ Show 20 Lines • Show All 2,000 Lines • ▼ Show 20 Lines	Instruction InstCombiner::FoldXorWithConstants(BinaryOperator &I, Value Op,
if (V1 == A \|\| V1 == B) {		if (V1 == A \|\| V1 == B) {
Value *NewOp = Builder->CreateAnd(V1 == A ? B : A, CI1);		Value *NewOp = Builder->CreateAnd(V1 == A ? B : A, CI1);
return BinaryOperator::CreateXor(NewOp, V1);		return BinaryOperator::CreateXor(NewOp, V1);
}		}

return nullptr;		return nullptr;
}		}

		/// \brief Match a (possibly shifted) load, part of an endian-independent load
		/// sequence.
		/// When nothing is known, try to discover the root load pointer (\p LoadPtr),
		/// and the data endianness of the load (\p IsLittleEndianLoad).
		/// If the root load pointer is known (by then, the endianness has to be known
		/// too), ensure that this (shifted?) load is consistent with them.
		static bool MatchShiftedLoad(Value V, Value &LoadPtr,
		bool &IsLittleEndianLoad, const DataLayout &DL,
		SmallVectorImpl<LoadInst *> &ByteLoads) {
		Value *LoadV = nullptr;
		ConstantInt *BitOffset = nullptr;
		bool IsShiftedLoad = true;
		const size_t ResByteSize = ByteLoads.size();

		// Try to match a shifted load:
		// LE: (LoadPtr[I] << (I * 8))
		// BE: (LoadPtr[sizeof(EntireLoad) - I - 1] << (I * 8))
		if (!match(V, m_Shl(m_ZExt(m_Value(LoadV)), m_ConstantInt(BitOffset)))) {
		// If not, match a non-shifted load.
		if (match(V, m_ZExt(m_Value(LoadV))))
		IsShiftedLoad = false;
		else
		return false;
		}

		// If we found a load pointer, we have to already know the endianness.
		// If not, then we need to discover both the pointer and the endianness.
		bool IsEndiannessKnown = (LoadPtr != nullptr);
		auto IsLittleEndian = [&](bool LittleEndian) {
		if (!IsEndiannessKnown) {
		IsEndiannessKnown = true;
		IsLittleEndianLoad = LittleEndian;
		}
		return IsLittleEndianLoad == LittleEndian;
		};

		LoadInst *LoadI = dyn_cast<LoadInst>(LoadV);
		if (!LoadI \|\| !LoadI->isSimple() \|\|
		hfinkelUnsubmitted Not Done Reply Inline Actions As you require these loads to all be in the same block and the initial instruction when you scan later, why don't you get the load's parent block here and bail out earlier? hfinkel: As you require these loads to all be in the same block and the initial instruction when you…
		LoadI->getParent() != cast<Instruction>(V)->getParent() \|\|
		LoadI->getType() != Type::getInt8Ty(LoadI->getContext()))
		return false;

		GetElementPtrInst *GEPI =
		dyn_cast<GetElementPtrInst>(LoadI->getPointerOperand());

		// If we don't have a root load pointer, discover it here.
		if (!LoadPtr)
		LoadPtr = (GEPI ? GEPI->getPointerOperand() : LoadI->getPointerOperand());

		// If we didn't find a GEP, this is a non-indexed load (LoadPtr[0]).
		if (!GEPI) {
		if (LoadPtr == LoadI->getPointerOperand()) {
		// Match the non-shifted non-indexed LE load.
		// LE: (LoadPtr[0])
		if (!IsShiftedLoad && IsLittleEndian(true))
		return ByteLoads[0] = LoadI;

		// Match the shifted non-indexed BE load.
		// BE: (LoadPtr[0] << ((sizeof(EntireLoad) - 1) * 8))
		// equivalent to (LoadPtr[sizeof(EntireLoad) - I - 1] << (I * 8))
		// when I == sizeof(EntireLoad) - 1
		if (IsShiftedLoad &&
		BitOffset->getValue().getZExtValue() == (ResByteSize - 1) * 8 &&
		IsLittleEndian(false)) {
		return ByteLoads[0] = LoadI;
		}
		}
		return false;
		}

		const unsigned AS = GEPI->getPointerAddressSpace();
		const unsigned OffsetSizeInBits = DL.getPointerSizeInBits(AS);

		APInt Offset(OffsetSizeInBits, 0);
		if (LoadPtr != GEPI->getPointerOperand() \|\|
		!GEPI->accumulateConstantOffset(DL, Offset))
		return false;

		const unsigned OffsetU = Offset.getZExtValue();
		if (OffsetU >= ResByteSize)
		return false;

		// Match the non-shifted BE load.
		// BE: (LoadPtr[sizeof(EntireLoad) - I - 1])
		if (!IsShiftedLoad) {
		if (IsLittleEndian(false)) {
		if (OffsetU != ResByteSize - 1)
		return false;
		return ByteLoads[ResByteSize - 1] = LoadI;
		}
		return false;
		}

		// If IsShiftedLoad, BitOffset has already been matched and is non-null.
		const unsigned BitOffsetU = BitOffset->getValue().getZExtValue();

		// Match the general case, shifted indexed load.
		// LE: (LoadPtr[I] << (I * 8)
		if (OffsetU * 8 == BitOffsetU && IsLittleEndian(true))
		return ByteLoads[OffsetU] = LoadI;
		// BE: (LoadPtr[sizeof(EntireLoad) - I - 1] << (I * 8))
		if ((ResByteSize - OffsetU - 1) * 8 == BitOffsetU && IsLittleEndian(false))
		return ByteLoads[ResByteSize - OffsetU - 1] = LoadI;

		return false;
		}

		/// \brief Replace a bytewise endian-independent load sequence (shl'd and or'd
		/// into a native-endian value) by a single native endian load, bswap'd if
		/// the load sequence endianness is different from the native target's.
		///
		/// For instance, a little-endian load on a little-endian target:
		///   Having: i32 EntireLoad, i8* LoadPtr
		///   Fold:
		///     EntireLoad =
		///       (LoadPtr[0] | LoadPtr[1] << 8 | LoadPtr[2] << 16 | LoadPtr[3] << 24)
		///
		///   into:
		///     EntireLoad = *((i32*)LoadPtr)
		///
		/// \p I is the root OR of the sequence.
		/// \returns the result of ReplaceInstUsesWith on \p I when the sequence was
		/// folded, nullptr when nothing was changed.
		Instruction *InstCombiner::FoldEndianIndependentLoad(BinaryOperator &I) {
		  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
		  Type *ResTy = I.getType();
		  const int ResBitSize = ResTy->getPrimitiveSizeInBits();
		  const int ResByteSize = ResBitSize / 8;

		  if (!DL || !ResTy->isIntegerTy() || (ResBitSize % 8) || ResBitSize < 16 ||
		      !isPowerOf2_32(ResByteSize))
		    return nullptr;

		  // Keep track of the instruction that loads each byte.
		  SmallVector<LoadInst *, 8> ByteLoads(ResByteSize);
		  // The pointer to the data loaded from.
		  Value *LoadPtr = nullptr;
		  // Whether the loaded data is little-endian.
		  bool IsLELoad = true;
		  // Whether the target data layout is little-endian.
		  const bool IsLETarget = DL->isLittleEndian();

		  // Try to match OR operands as (shifted?) loads.
		  // Keep track of the OR operands.
		  int NumMatchedLoads = 0;
		  SmallVector<Value *, 8> ByteValueWorklist;
		  ByteValueWorklist.push_back(Op0);
		  ByteValueWorklist.push_back(Op1);
		  for (size_t i = 0; i < ByteValueWorklist.size(); ++i) {
		    Value *V = ByteValueWorklist[i];

		    // Try to match a (shifted?) load.
		    if (MatchShiftedLoad(V, LoadPtr, IsLELoad, *DL, ByteLoads)) {
		      // More leaves than result bytes means some byte is loaded more than
		      // once. Only one load per byte is tracked in ByteLoads, so the extra
		      // load would escape the store scan below: give up.
		      if (++NumMatchedLoads > ResByteSize)
		        return nullptr;
		      continue;
		    }

		    // Try to match another OR.
		    Value *LHS = nullptr, *RHS = nullptr;
		    if (!match(V, m_Or(m_Value(LHS), m_Value(RHS))))
		      return nullptr;
		    ByteValueWorklist.push_back(LHS);
		    ByteValueWorklist.push_back(RHS);
		  }

		  // Sort by pointer value so that membership can be tested with a binary
		  // search during the scan below.
		  std::sort(ByteLoads.begin(), ByteLoads.end());

		  // Check that we load all bytes. If we don't, there's one or more nullptr(s)
		  // in ByteLoads, which will be sorted to the beginning.
		  if (!ByteLoads[0])
		    return nullptr;

		  // The folded load reads through LoadPtr itself, so only a byte load whose
		  // address is exactly LoadPtr tells us anything about its alignment. Loads
		  // at other offsets may be more aligned than LoadPtr, and after the sort
		  // above ByteLoads[0] is an arbitrary one of them. An alignment of 0 on an
		  // i8 load stands for its ABI alignment; never forward it as-is, because on
		  // the wide load it would stand for the wide type's ABI alignment instead.
		  unsigned Alignment = 1;
		  for (LoadInst *ByteLoad : ByteLoads)
		    if (ByteLoad->getPointerOperand() == LoadPtr)
		      Alignment = std::max(Alignment, ByteLoad->getAlignment());

		  AliasAnalysis *AA = getAnalysisIfAvailable<AliasAnalysis>();

		  // Now go back up until we encounter all loads. Check all stores in between.
		  // If we made it this far, we have a clean OR-tree representing an
		  // endian-independent load sequence. We can (somewhat) take our time, but
		  // add a max-scan safeguard just in case.
		  int NumLoadsEncountered = 0;
		  BasicBlock::iterator BBI = &I;
		  for (unsigned MaxScanInsts = ResByteSize * 5 + 50;
		       BBI != I.getParent()->begin() && MaxScanInsts; --MaxScanInsts) {
		    --BBI;

		    // If this is a store to an aliasing location, bail out.
		    // Inline review comment (hfinkel): Okay, but why not be smart? Making an
		    // AA query is easy.
		    if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
		      if (!SI->isSimple() || !AA ||
		          AA->alias(SI->getPointerOperand(), AliasAnalysis::UnknownSize,
		                    LoadPtr, ResByteSize))
		        return nullptr;
		    } else if (BBI->mayWriteToMemory()) {
		      // If this is another kind of instruction, bail out.
		      return nullptr;
		    }

		    // If we found one of our loads, count it.
		    if (LoadInst *LI = dyn_cast<LoadInst>(BBI))
		      if (std::binary_search(ByteLoads.begin(), ByteLoads.end(), LI))
		        ++NumLoadsEncountered;

		    // If we found all our loads, no need to continue.
		    if (NumLoadsEncountered == ResByteSize)
		      break;
		  }

		  // If we didn't find all our loads yet, abort.
		  if (NumLoadsEncountered != ResByteSize)
		    return nullptr;

		  const unsigned AS = cast<PointerType>(LoadPtr->getType())->getAddressSpace();
		  Instruction *FoldedLoad = Builder->CreateAlignedLoad(
		      Builder->CreatePointerCast(LoadPtr, ResTy->getPointerTo(AS)), Alignment);

		  // If the target and loaded data endianness are different, swap them after
		  // doing the target endian load.
		  if (IsLETarget != IsLELoad) {
		    Type *Tys[] = {FoldedLoad->getType()};
		    Module *M = Builder->GetInsertBlock()->getParent()->getParent();
		    Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys);
		    FoldedLoad = Builder->CreateCall(TheFn, FoldedLoad);
		  }
		  return ReplaceInstUsesWith(I, FoldedLoad);
		}

Instruction *InstCombiner::visitOr(BinaryOperator &I) {		Instruction *InstCombiner::visitOr(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);		bool Changed = SimplifyAssociativeOrCommutative(I);
Value Op0 = I.getOperand(0), Op1 = I.getOperand(1);		Value Op0 = I.getOperand(0), Op1 = I.getOperand(1);

if (Value *V = SimplifyVectorOp(I))		if (Value *V = SimplifyVectorOp(I))
return ReplaceInstUsesWith(I, V);		return ReplaceInstUsesWith(I, V);

if (Value *V = SimplifyOrInst(Op0, Op1, DL, TLI, DT, AT))		if (Value *V = SimplifyOrInst(Op0, Op1, DL, TLI, DT, AT))
▲ Show 20 Lines • Show All 342 Lines • ▼ Show 20 Lines	if (Op0->hasOneUse() && Op1->hasOneUse() &&
match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) &&		match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) &&
match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) {		match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) {
Value *orTrue = Builder->CreateOr(A, C);		Value *orTrue = Builder->CreateOr(A, C);
Value *orFalse = Builder->CreateOr(B, D);		Value *orFalse = Builder->CreateOr(B, D);
return SelectInst::Create(X, orTrue, orFalse);		return SelectInst::Create(X, orTrue, orFalse);
}		}
}		}

		if (Instruction *FoldedLoad = FoldEndianIndependentLoad(I)) {
		return FoldedLoad;
		}

return Changed ? &I : nullptr;		return Changed ? &I : nullptr;
}		}

Instruction *InstCombiner::visitXor(BinaryOperator &I) {		Instruction *InstCombiner::visitXor(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);		bool Changed = SimplifyAssociativeOrCommutative(I);
Value Op0 = I.getOperand(0), Op1 = I.getOperand(1);		Value Op0 = I.getOperand(0), Op1 = I.getOperand(1);

if (Value *V = SimplifyVectorOp(I))		if (Value *V = SimplifyVectorOp(I))
▲ Show 20 Lines • Show All 304 Lines • Show Last 20 Lines

test/Transforms/InstCombine/endian-independent-load-BE.ll

This file was added.

				; RUN: opt %s -instcombine -S \| FileCheck %s
				target datalayout = "E-m:o-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-apple-macosx10.10.0"

				; CHECK-LABEL: @test_BE_loadBE32
				; CHECK: %1 = bitcast i8* %buffer to i32*
				; CHECK: %2 = load i32* %1, align 1
				; CHECK: ret i32 %2
				define i32 @test_BE_loadBE32(i8* nocapture readonly %buffer) {
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = shl nuw i32 %2, 24
				%4 = getelementptr inbounds i8* %buffer, i64 1
				%5 = load i8* %4, align 1
				%6 = zext i8 %5 to i32
				%7 = shl nuw nsw i32 %6, 16
				%8 = or i32 %7, %3
				%9 = getelementptr inbounds i8* %buffer, i64 2
				%10 = load i8* %9, align 1
				%11 = zext i8 %10 to i32
				%12 = shl nuw nsw i32 %11, 8
				%13 = or i32 %8, %12
				%14 = getelementptr inbounds i8* %buffer, i64 3
				%15 = load i8* %14, align 1
				%16 = zext i8 %15 to i32
				%17 = or i32 %13, %16
				ret i32 %17
				}

				; CHECK-LABEL: @test_BE_loadBE32_with_addrspace
				; CHECK: %1 = bitcast i8 addrspace(5)* %buffer to i32 addrspace(5)*
				; CHECK: %2 = load i32 addrspace(5)* %1, align 1
				; CHECK: ret i32 %2
				define i32 @test_BE_loadBE32_with_addrspace(i8 addrspace(5)* nocapture readonly %buffer) {
				%1 = load i8 addrspace(5)* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = shl nuw i32 %2, 24
				%4 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 1
				%5 = load i8 addrspace(5)* %4, align 1
				%6 = zext i8 %5 to i32
				%7 = shl nuw nsw i32 %6, 16
				%8 = or i32 %7, %3
				%9 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 2
				%10 = load i8 addrspace(5)* %9, align 1
				%11 = zext i8 %10 to i32
				%12 = shl nuw nsw i32 %11, 8
				%13 = or i32 %8, %12
				%14 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 3
				%15 = load i8 addrspace(5)* %14, align 1
				%16 = zext i8 %15 to i32
				%17 = or i32 %13, %16
				ret i32 %17
				}

				; CHECK-LABEL: @test_BE_loadLE32
				; CHECK: %1 = bitcast i8* %buffer to i32*
				; CHECK: %2 = load i32* %1, align 1
				; CHECK: %3 = call i32 @llvm.bswap.i32(i32 %2)
				; CHECK: ret i32 %3
				define i32 @test_BE_loadLE32(i8* nocapture readonly %buffer) #0 {
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = getelementptr inbounds i8* %buffer, i64 1
				%4 = load i8* %3, align 1
				%5 = zext i8 %4 to i32
				%6 = shl nuw nsw i32 %5, 8
				%7 = or i32 %6, %2
				%8 = getelementptr inbounds i8* %buffer, i64 2
				%9 = load i8* %8, align 1
				%10 = zext i8 %9 to i32
				%11 = shl nuw nsw i32 %10, 16
				%12 = or i32 %7, %11
				%13 = getelementptr inbounds i8* %buffer, i64 3
				%14 = load i8* %13, align 1
				%15 = zext i8 %14 to i32
				%16 = shl nuw i32 %15, 24
				%17 = or i32 %12, %16
				ret i32 %17
				}

				declare void @load_buffer(i8, i32)

				; CHECK-LABEL: @test_BE_loadBE32_align4
				; CHECK: %buffer_i = alloca i32, align 4
				; CHECK: %buffer = bitcast i32* %buffer_i to i8*
				; CHECK: call void @load_buffer(i8* %buffer, i32* %buffer_int)
				; CHECK: %1 = load i32* %buffer_i, align 4
				; CHECK: ret i32 %1
				define i32 @test_BE_loadBE32_align4(i32* %buffer_int) {
				%buffer_i = alloca i32, align 4
				%buffer = bitcast i32* %buffer_i to i8*
				call void @load_buffer(i8* %buffer, i32* %buffer_int)
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = shl nuw i32 %2, 24
				%4 = getelementptr inbounds i8* %buffer, i64 1
				%5 = load i8* %4, align 1
				%6 = zext i8 %5 to i32
				%7 = shl nuw nsw i32 %6, 16
				%8 = or i32 %7, %3
				%9 = getelementptr inbounds i8* %buffer, i64 2
				%10 = load i8* %9, align 1
				%11 = zext i8 %10 to i32
				%12 = shl nuw nsw i32 %11, 8
				%13 = or i32 %8, %12
				%14 = getelementptr inbounds i8* %buffer, i64 3
				%15 = load i8* %14, align 1
				%16 = zext i8 %15 to i32
				%17 = or i32 %13, %16
				ret i32 %17
				}

				; No change expected
				; CHECK-LABEL: @test_BE_broken_sparse
				define i32 @test_BE_broken_sparse(i8* nocapture readonly %buffer) {
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = shl nuw i32 %2, 24
				%4 = getelementptr inbounds i8* %buffer, i64 1
				%5 = load i8* %4, align 1
				%6 = zext i8 %5 to i32
				%7 = shl nuw nsw i32 %6, 16
				%8 = or i32 %7, %3
				%9 = getelementptr inbounds i8* %buffer, i64 2
				%10 = load i8* %9, align 1
				%11 = zext i8 %10 to i32
				%12 = shl nuw nsw i32 %11, 8
				%13 = or i32 %8, %12
				%14 = getelementptr inbounds i8* %buffer, i64 4
				; CHECK: %14 = getelementptr inbounds i8* %buffer, i64 4
				%15 = load i8* %14, align 1
				; CHECK: %15 = load i8* %14, align 1
				%16 = zext i8 %15 to i32
				; CHECK: %16 = zext i8 %15 to i32
				%17 = or i32 %13, %16
				; CHECK: %17 = or i32 %13, %16
				ret i32 %17
				; CHECK: ret i32 %17
				}
				;;
				; No change expected
				; CHECK-LABEL: @test_BE_broken_shift
				define i32 @test_BE_broken_shift(i8* nocapture readonly %buffer) {
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = shl nuw i32 %2, 24
				%4 = getelementptr inbounds i8* %buffer, i64 1
				%5 = load i8* %4, align 1
				%6 = zext i8 %5 to i32
				%7 = shl nuw nsw i32 %6, 8
				; CHECK: %7 = shl nuw nsw i32 %6, 8
				%8 = or i32 %7, %3
				%9 = getelementptr inbounds i8* %buffer, i64 2
				%10 = load i8* %9, align 1
				%11 = zext i8 %10 to i32
				%12 = shl nuw nsw i32 %11, 16
				; CHECK: %12 = shl nuw nsw i32 %11, 16
				%13 = or i32 %8, %12
				; CHECK: %13 = or i32 %8, %12
				%14 = getelementptr inbounds i8* %buffer, i64 3
				%15 = load i8* %14, align 1
				%16 = zext i8 %15 to i32
				%17 = or i32 %13, %16
				; CHECK: %17 = or i32 %13, %16
				ret i32 %17
				; CHECK: ret i32 %17
				}

test/Transforms/InstCombine/endian-independent-load-LE-aliasing.ll

This file was added.

				; RUN: opt %s -basicaa -instcombine -S \| FileCheck %s
				target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-apple-macosx10.10.0"

				; CHECK-LABEL: @test_LE_load_noalias_allocastore
				; CHECK: %c0b = load i8* %buffer, align 1
				; CHECK: %1 = bitcast i8* %buffer to i32*
				; CHECK: %2 = load i32* %1, align 1
				; CHECK: %3 = add i32 %2, %c42
				; CHECK: ret i32 %3
				define i32 @test_LE_load_noalias_allocastore(i8* nocapture %buffer) {
				%locbuf = alloca i32, align 4
				%c0b = load i8* %buffer, align 1
				%c0 = zext i8 %c0b to i32
				store i32 %c0, i32* %locbuf
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = getelementptr inbounds i8* %buffer, i64 1
				%4 = load i8* %3, align 1
				%5 = zext i8 %4 to i32
				%6 = shl nuw nsw i32 %5, 8
				%7 = or i32 %6, %2
				%8 = getelementptr inbounds i8* %buffer, i64 2
				%9 = load i8* %8, align 1
				%c0l = load i32* %locbuf
				%c42 = add i32 %c0l, 42
				store i32 %c42, i32* %locbuf
				%10 = zext i8 %9 to i32
				%11 = shl nuw nsw i32 %10, 16
				%12 = or i32 %7, %11
				%13 = getelementptr inbounds i8* %buffer, i64 3
				%14 = load i8* %13, align 1
				%15 = zext i8 %14 to i32
				%16 = shl nuw i32 %15, 24
				%17 = or i32 %12, %16
				%c42l = load i32* %locbuf
				%18 = add i32 %17, %c42l
				ret i32 %18
				}

				; CHECK-LABEL: @test_LE_load_noalias_offsetstore
				; CHECK: %buf_off = getelementptr inbounds i8* %buffer, i64 4
				; CHECK: %c0l = load i8* %buf_off, align 1
				; CHECK: %c42 = add i8 %c0l, 42
				; CHECK: store i8 %c42, i8* %buf_off, align 1
				; CHECK: %1 = bitcast i8* %buffer to i32*
				; CHECK: %2 = load i32* %1, align 1
				; CHECK: %c42z = zext i8 %c42 to i32
				; CHECK: %3 = add i32 %2, %c42z
				; CHECK: ret i32 %3
				define i32 @test_LE_load_noalias_offsetstore(i8* nocapture %buffer) {
				%locbuf = alloca i32, align 4
				%c0b = load i8* %buffer, align 1
				%c0 = zext i8 %c0b to i32
				store i32 %c0, i32* %locbuf
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = getelementptr inbounds i8* %buffer, i64 1
				%4 = load i8* %3, align 1
				%5 = zext i8 %4 to i32
				%6 = shl nuw nsw i32 %5, 8
				%7 = or i32 %6, %2
				%8 = getelementptr inbounds i8* %buffer, i64 2
				%9 = load i8* %8, align 1
				%buf_off = getelementptr inbounds i8* %buffer, i64 4
				%c0l = load i8* %buf_off
				%c42 = add i8 %c0l, 42
				store i8 %c42, i8* %buf_off
				%10 = zext i8 %9 to i32
				%11 = shl nuw nsw i32 %10, 16
				%12 = or i32 %7, %11
				%13 = getelementptr inbounds i8* %buffer, i64 3
				%14 = load i8* %13, align 1
				%15 = zext i8 %14 to i32
				%16 = shl nuw i32 %15, 24
				%17 = or i32 %12, %16
				%c42l = load i8* %buf_off
				%c42z = zext i8 %c42l to i32
				%18 = add i32 %17, %c42z
				ret i32 %18
				}

				; CHECK-LABEL: @test_LE_load_alias_store
				define i32 @test_LE_load_alias_store(i8* nocapture %buffer) {
				%1 = load i8* %buffer, align 1
				; CHECK: %1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = getelementptr inbounds i8* %buffer, i64 1
				%4 = load i8* %3, align 1
				; CHECK: %4 = load i8* %3, align 1
				%5 = zext i8 %4 to i32
				%6 = shl nuw nsw i32 %5, 8
				%7 = or i32 %6, %2
				%8 = getelementptr inbounds i8* %buffer, i64 2
				%9 = load i8* %8, align 1
				; CHECK: %9 = load i8* %8, align 1
				%buf_off = getelementptr inbounds i8* %buffer, i64 2
				%c0l = load i8* %buf_off
				%c42 = add i8 %c0l, 42
				store i8 %c42, i8* %buf_off
				; CHECK: store i8 %c42, i8* %buf_off
				%10 = zext i8 %9 to i32
				%11 = shl nuw nsw i32 %10, 16
				%12 = or i32 %7, %11
				%13 = getelementptr inbounds i8* %buffer, i64 3
				%14 = load i8* %13, align 1
				; CHECK: %14 = load i8* %13, align 1
				%15 = zext i8 %14 to i32
				%16 = shl nuw i32 %15, 24
				%17 = or i32 %12, %16
				%c42l = load i8* %buf_off
				; CHECK: %c42l = load i8* %buf_off
				%c42z = zext i8 %c42l to i32
				%18 = add i32 %17, %c42z
				ret i32 %18
				; CHECK: ret i32 %18
				}

				; CHECK-LABEL: @test_LE_load_mayalias_store
				define i32 @test_LE_load_mayalias_store(i8* nocapture %buffer, i8* nocapture %othermem) {
				%1 = load i8* %buffer, align 1
				; CHECK: %1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = getelementptr inbounds i8* %buffer, i64 1
				%4 = load i8* %3, align 1
				; CHECK: %4 = load i8* %3, align 1
				%5 = zext i8 %4 to i32
				%6 = shl nuw nsw i32 %5, 8
				%7 = or i32 %6, %2
				%8 = getelementptr inbounds i8* %buffer, i64 2
				%9 = load i8* %8, align 1
				; CHECK: %9 = load i8* %8, align 1
				%buf_off = getelementptr inbounds i8* %othermem, i64 2
				%c0l = load i8* %buf_off
				; CHECK: %c0l = load i8* %buf_off
				%c42 = add i8 %c0l, 42
				store i8 %c42, i8* %buf_off
				; CHECK: store i8 %c42, i8* %buf_off
				%10 = zext i8 %9 to i32
				%11 = shl nuw nsw i32 %10, 16
				%12 = or i32 %7, %11
				%13 = getelementptr inbounds i8* %buffer, i64 3
				%14 = load i8* %13, align 1
				; CHECK: %14 = load i8* %13, align 1
				%15 = zext i8 %14 to i32
				%16 = shl nuw i32 %15, 24
				%17 = or i32 %12, %16
				; CHECK: %17 = or i32 %12, %16
				%c42l = load i8* %buf_off
				; CHECK: %c42l = load i8* %buf_off
				%c42z = zext i8 %c42l to i32
				%18 = add i32 %17, %c42z
				ret i32 %18
				; CHECK: ret i32 %18
				}

test/Transforms/InstCombine/endian-independent-load-LE.ll

This file was added.

				; RUN: opt %s -instcombine -S \| FileCheck %s
				target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-apple-macosx10.10.0"

				; CHECK-LABEL: @test_LE_loadLE32
				; CHECK: %1 = bitcast i8* %buffer to i32*
				; CHECK: %2 = load i32* %1, align 1
				; CHECK: ret i32 %2
				define i32 @test_LE_loadLE32(i8* nocapture readonly %buffer) {
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = getelementptr inbounds i8* %buffer, i64 1
				%4 = load i8* %3, align 1
				%5 = zext i8 %4 to i32
				%6 = shl nuw nsw i32 %5, 8
				%7 = or i32 %6, %2
				%8 = getelementptr inbounds i8* %buffer, i64 2
				%9 = load i8* %8, align 1
				%10 = zext i8 %9 to i32
				%11 = shl nuw nsw i32 %10, 16
				%12 = or i32 %7, %11
				%13 = getelementptr inbounds i8* %buffer, i64 3
				%14 = load i8* %13, align 1
				%15 = zext i8 %14 to i32
				%16 = shl nuw i32 %15, 24
				%17 = or i32 %12, %16
				ret i32 %17
				}

				; CHECK-LABEL: @test_LE_loadLE64
				; CHECK: %1 = bitcast i8* %buffer to i64*
				; CHECK: %2 = load i64* %1, align 1
				; CHECK: ret i64 %2
				define i64 @test_LE_loadLE64(i8* nocapture readonly %buffer) {
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i64
				%3 = getelementptr inbounds i8* %buffer, i64 1
				%4 = load i8* %3, align 1
				%5 = zext i8 %4 to i64
				%6 = shl nuw nsw i64 %5, 8
				%7 = or i64 %6, %2
				%8 = getelementptr inbounds i8* %buffer, i64 2
				%9 = load i8* %8, align 1
				%10 = zext i8 %9 to i64
				%11 = shl nuw nsw i64 %10, 16
				%12 = or i64 %7, %11
				%13 = getelementptr inbounds i8* %buffer, i64 3
				%14 = load i8* %13, align 1
				%15 = zext i8 %14 to i64
				%16 = shl nuw nsw i64 %15, 24
				%17 = or i64 %12, %16
				%18 = getelementptr inbounds i8* %buffer, i64 4
				%19 = load i8* %18, align 1
				%20 = zext i8 %19 to i64
				%21 = shl nuw nsw i64 %20, 32
				%22 = or i64 %17, %21
				%23 = getelementptr inbounds i8* %buffer, i64 5
				%24 = load i8* %23, align 1
				%25 = zext i8 %24 to i64
				%26 = shl nuw nsw i64 %25, 40
				%27 = or i64 %22, %26
				%28 = getelementptr inbounds i8* %buffer, i64 6
				%29 = load i8* %28, align 1
				%30 = zext i8 %29 to i64
				%31 = shl nuw nsw i64 %30, 48
				%32 = or i64 %27, %31
				%33 = getelementptr inbounds i8* %buffer, i64 7
				%34 = load i8* %33, align 1
				%35 = zext i8 %34 to i64
				%36 = shl nuw i64 %35, 56
				%37 = or i64 %32, %36
				ret i64 %37
				}

				; CHECK-LABEL: @test_LE_loadLE32_permuted
				; CHECK: %1 = bitcast i8* %buffer to i32*
				; CHECK: %2 = load i32* %1, align 1
				; CHECK: ret i32 %2
				define i32 @test_LE_loadLE32_permuted(i8* nocapture readonly %buffer) {
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = getelementptr inbounds i8* %buffer, i64 1
				%4 = load i8* %3, align 1
				%5 = zext i8 %4 to i32
				%6 = shl nuw nsw i32 %5, 8
				%7 = getelementptr inbounds i8* %buffer, i64 2
				%8 = load i8* %7, align 1
				%9 = zext i8 %8 to i32
				%10 = shl nuw nsw i32 %9, 16
				%11 = getelementptr inbounds i8* %buffer, i64 3
				%12 = load i8* %11, align 1
				%13 = zext i8 %12 to i32
				%14 = shl nuw i32 %13, 24
				%15 = or i32 %6, %2
				%16 = or i32 %14, %10
				%17 = or i32 %16, %15
				ret i32 %17
				}

				; Same byte-at-offset-k-shifted-by-8*k pattern as a plain 32-bit assembly from
				; four byte loads, but every pointer lives in addrspace(5).
				; Expected output: the bitcast and the merged i32 load both keep addrspace(5).
				; CHECK-LABEL: @test_LE_loadLE32_with_addrspace
				; CHECK: %1 = bitcast i8 addrspace(5)* %buffer to i32 addrspace(5)*
				; CHECK: %2 = load i32 addrspace(5)* %1, align 1
				; CHECK: ret i32 %2
				define i32 @test_LE_loadLE32_with_addrspace(i8 addrspace(5)* nocapture readonly %buffer) {
				; Byte 0, unshifted.
				%1 = load i8 addrspace(5)* %buffer, align 1
				%2 = zext i8 %1 to i32
				; Byte 1 -> bits 8..15, merged into the running value.
				%3 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 1
				%4 = load i8 addrspace(5)* %3, align 1
				%5 = zext i8 %4 to i32
				%6 = shl nuw nsw i32 %5, 8
				%7 = or i32 %6, %2
				; Byte 2 -> bits 16..23.
				%8 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 2
				%9 = load i8 addrspace(5)* %8, align 1
				%10 = zext i8 %9 to i32
				%11 = shl nuw nsw i32 %10, 16
				%12 = or i32 %7, %11
				; Byte 3 -> bits 24..31.
				%13 = getelementptr inbounds i8 addrspace(5)* %buffer, i64 3
				%14 = load i8 addrspace(5)* %13, align 1
				%15 = zext i8 %14 to i32
				%16 = shl nuw i32 %15, 24
				%17 = or i32 %12, %16
				ret i32 %17
				}

				; Big-endian assembly of a 32-bit value: byte 0 is shifted by 24, byte 1 by 16,
				; byte 2 by 8 and byte 3 is left unshifted.
				; Expected output: one i32 load (align 1) followed by a call to
				; @llvm.bswap.i32 on the loaded value.
				; NOTE(review): attribute group #0 is referenced below but is not defined in
				; the visible part of the file -- confirm it exists elsewhere.
				; CHECK-LABEL: @test_LE_loadBE32
				; CHECK: %1 = bitcast i8* %buffer to i32*
				; CHECK: %2 = load i32* %1, align 1
				; CHECK: %3 = call i32 @llvm.bswap.i32(i32 %2)
				; CHECK: ret i32 %3
				define i32 @test_LE_loadBE32(i8* nocapture readonly %buffer) #0 {
				; Byte 0 -> bits 24..31 (most significant).
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				%3 = shl nuw i32 %2, 24
				; Byte 1 -> bits 16..23.
				%4 = getelementptr inbounds i8* %buffer, i64 1
				%5 = load i8* %4, align 1
				%6 = zext i8 %5 to i32
				%7 = shl nuw nsw i32 %6, 16
				%8 = or i32 %7, %3
				; Byte 2 -> bits 8..15.
				%9 = getelementptr inbounds i8* %buffer, i64 2
				%10 = load i8* %9, align 1
				%11 = zext i8 %10 to i32
				%12 = shl nuw nsw i32 %11, 8
				%13 = or i32 %8, %12
				; Byte 3 -> bits 0..7, so no shift is needed.
				%14 = getelementptr inbounds i8* %buffer, i64 3
				%15 = load i8* %14, align 1
				%16 = zext i8 %15 to i32
				%17 = or i32 %13, %16
				ret i32 %17
				}

				; Opaque external callee used by @test_LE_loadLE32_align4. Its parameter types
				; must match that call site, which passes (i8*, i32*).
				declare void @load_buffer(i8*, i32*)

				; Alignment propagation: the bytes are read from a stack slot created as
				; 'alloca i32, align 4', and the load of byte 0 carries 'align 4'.
				; Expected output: the alloca, bitcast and call are kept, and the four byte
				; loads become a single 'load i32* %buffer_i, align 4' taken directly from the
				; alloca (no extra bitcast, alignment 4 rather than 1).
				; CHECK-LABEL: @test_LE_loadLE32_align4
				; CHECK: %buffer_i = alloca i32, align 4
				; CHECK: %buffer = bitcast i32* %buffer_i to i8*
				; CHECK: call void @load_buffer(i8* %buffer, i32* %buffer_int)
				; CHECK: %1 = load i32* %buffer_i, align 4
				; CHECK: ret i32 %1
				define i32 @test_LE_loadLE32_align4(i32* %buffer_int) {
				%buffer_i = alloca i32, align 4
				%buffer = bitcast i32* %buffer_i to i8*
				; The slot's address is handed to an external function before the loads.
				call void @load_buffer(i8* %buffer, i32* %buffer_int)
				; Byte 0, unshifted; this is the only byte load marked align 4.
				%1 = load i8* %buffer, align 4
				%2 = zext i8 %1 to i32
				; Byte 1 -> bits 8..15.
				%3 = getelementptr inbounds i8* %buffer, i64 1
				%4 = load i8* %3, align 1
				%5 = zext i8 %4 to i32
				%6 = shl nuw nsw i32 %5, 8
				%7 = or i32 %6, %2
				; Byte 2 -> bits 16..23.
				%8 = getelementptr inbounds i8* %buffer, i64 2
				%9 = load i8* %8, align 1
				%10 = zext i8 %9 to i32
				%11 = shl nuw nsw i32 %10, 16
				%12 = or i32 %7, %11
				; Byte 3 -> bits 24..31.
				%13 = getelementptr inbounds i8* %buffer, i64 3
				%14 = load i8* %13, align 1
				%15 = zext i8 %14 to i32
				%16 = shl nuw i32 %15, 24
				%17 = or i32 %12, %16
				ret i32 %17
				}

				; Negative test: no change expected.
				; The bytes come from offsets 0, 1, 2 and 4 -- offset 3 is skipped -- so the
				; four loads do not cover one contiguous 4-byte region. The expectations below
				; pin the tail of the sequence (from the offset-4 GEP onward) as unchanged.
				; CHECK-LABEL: @test_LE_broken_sparse
				define i32 @test_LE_broken_sparse(i8* nocapture readonly %buffer) {
				; Byte at offset 0, unshifted.
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				; Byte at offset 1 -> bits 8..15.
				%3 = getelementptr inbounds i8* %buffer, i64 1
				%4 = load i8* %3, align 1
				%5 = zext i8 %4 to i32
				%6 = shl nuw nsw i32 %5, 8
				%7 = or i32 %6, %2
				; Byte at offset 2 -> bits 16..23.
				%8 = getelementptr inbounds i8* %buffer, i64 2
				%9 = load i8* %8, align 1
				%10 = zext i8 %9 to i32
				%11 = shl nuw nsw i32 %10, 16
				%12 = or i32 %7, %11
				; Offset 4 instead of 3: this gap is what breaks the pattern.
				%13 = getelementptr inbounds i8* %buffer, i64 4
				; CHECK: %13 = getelementptr inbounds i8* %buffer, i64 4
				%14 = load i8* %13, align 1
				; CHECK: %14 = load i8* %13, align 1
				%15 = zext i8 %14 to i32
				; CHECK: %15 = zext i8 %14 to i32
				%16 = shl nuw i32 %15, 24
				; CHECK: %16 = shl nuw i32 %15, 24
				%17 = or i32 %12, %16
				; CHECK: %17 = or i32 %12, %16
				ret i32 %17
				; CHECK: ret i32 %17
				}

				; Negative test: no change expected.
				; The byte offsets are contiguous (0..3) but the shift amounts are 0, 8, 24, 16:
				; bytes 2 and 3 have their shifts swapped, so the result is neither the
				; byte-k-at-bit-8*k layout nor its full reverse. The expectations pin both
				; out-of-order shifts and the final or/ret as unchanged.
				; CHECK-LABEL: @test_LE_broken_shift
				define i32 @test_LE_broken_shift(i8* nocapture readonly %buffer) {
				; Byte 0, unshifted.
				%1 = load i8* %buffer, align 1
				%2 = zext i8 %1 to i32
				; Byte 1 -> bits 8..15.
				%3 = getelementptr inbounds i8* %buffer, i64 1
				%4 = load i8* %3, align 1
				%5 = zext i8 %4 to i32
				%6 = shl nuw nsw i32 %5, 8
				%7 = or i32 %6, %2
				; Byte 2 is shifted by 24 (a little-endian layout would use 16).
				%8 = getelementptr inbounds i8* %buffer, i64 2
				%9 = load i8* %8, align 1
				%10 = zext i8 %9 to i32
				%11 = shl nuw nsw i32 %10, 24
				; CHECK: %11 = shl nuw nsw i32 %10, 24
				%12 = or i32 %7, %11
				; Byte 3 is shifted by 16 (a little-endian layout would use 24).
				%13 = getelementptr inbounds i8* %buffer, i64 3
				%14 = load i8* %13, align 1
				%15 = zext i8 %14 to i32
				%16 = shl nuw nsw i32 %15, 16
				; CHECK: %16 = shl nuw nsw i32 %15, 16
				%17 = or i32 %12, %16
				; CHECK: %17 = or i32 %12, %16
				ret i32 %17
				; CHECK: ret i32 %17
				}

				; Negative test: no change expected.
				; Only three bytes (offsets 0, 1, 2) are combined, with shifts 16, 8 and 0, so
				; just the low 24 bits of the i32 result are populated. The base pointer is
				; itself loaded from %bits rather than passed in directly.
				; NOTE(review): presumably the fold is rejected because three bytes do not
				; form a full i32 -- confirm against the transform's implementation.
				; CHECK-LABEL: @test_LE_broken_load
				define i32 @test_LE_broken_load(i8** %bits) {
				; Fetch the byte pointer that the remaining loads are based on.
				%bits_ptr = load i8** %bits, align 8
				; CHECK: %bits_ptr = load i8** %bits, align 8
				; Byte 0 -> bits 16..23.
				%1 = load i8* %bits_ptr, align 1
				%conv = zext i8 %1 to i32
				%shl = shl nuw nsw i32 %conv, 16
				; Byte 1 -> bits 8..15.
				%arrayidx2 = getelementptr inbounds i8* %bits_ptr, i64 1
				%2 = load i8* %arrayidx2, align 1
				%conv3 = zext i8 %2 to i32
				%shl4 = shl nuw nsw i32 %conv3, 8
				%or = or i32 %shl, %shl4
				; Byte 2 -> bits 0..7, unshifted; there is no fourth byte.
				%arrayidx6 = getelementptr inbounds i8* %bits_ptr, i64 2
				; CHECK: %arrayidx6 = getelementptr inbounds i8* %bits_ptr, i64 2
				%3 = load i8* %arrayidx6, align 1
				; CHECK: %3 = load i8* %arrayidx6, align 1
				%conv7 = zext i8 %3 to i32
				; CHECK: %conv7 = zext i8 %3 to i32
				%or8 = or i32 %or, %conv7
				; CHECK: %or8 = or i32 %or, %conv7
				ret i32 %or8
				; CHECK: ret i32 %or8
				}