
[DAGCombiner] Match load by bytes idiom and fold it into a single load. Attempt #2.

Authored by apilipenko on Dec 16 2016, 1:48 PM.



The previous patch got reverted because of a bug. Chandler also requested some changes to the algorithm.

This is an updated patch. The key difference is that collectBitProviders now collects the origin of a single byte, not the whole value. This simplifies the implementation and allows the traversal to stop earlier when we know the result won't be used.

From the original review:

Match a pattern where a wide-type scalar value is loaded by several narrow loads and combined by shifts and ORs. Fold it into a single load, or a load and a bswap if the target supports it.

Assuming little endian target:

i8 *a = ...
i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
=>
i32 val = *((i32*)a)

i8 *a = ...
i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
=>
i32 val = BSWAP(*((i32*)a))

Diff Detail


Event Timeline

apilipenko updated this revision to Diff 81793.Dec 16 2016, 1:48 PM
apilipenko retitled this revision from to [DAGCombiner] Match load by bytes idiom and fold it into a single load. Attempt #2..
apilipenko updated this object.
apilipenko updated this object.
apilipenko added a subscriber: llvm-commits.
apilipenko updated this object.Dec 16 2016, 1:51 PM
RKSimon added inline comments.Dec 17 2016, 3:28 PM
4481 ↗(On Diff #81793)

What is the effect of changing this to:

if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT))

Would the legalizer do such a bad job of splitting poorly combined loads/bswaps?

4540 ↗(On Diff #81793)

Please make the assert messages more explanatory.

4559 ↗(On Diff #81793)

Would this work?

if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
1 ↗(On Diff #81793)

Additionally, test with an armv6/7 CPU with REV? Same for big-endian tests.

filcab edited edge metadata.Jan 3 2017, 8:34 AM

Looks good in general. Please check out Simon's comments, as I'm curious about them too.

4388 ↗(On Diff #81793)

Nit: We're not "collecting" any more. Maybe {get,calculate,deduce}ByteProvider? (Or something else)
Also fix the comment.

4559 ↗(On Diff #81793)

I wonder if it's useful to generate a bswap only to change it back later. Do you have an example of something llvm already does? Or would this be a future optimization possibility?

RKSimon edited edge metadata. Jan 3 2017, 11:21 AM

The script has been updated to support ARM/AARCH64, so you should be able to use it to regenerate now.

apilipenko updated this revision to Diff 83613.Jan 9 2017, 5:03 AM
apilipenko edited edge metadata.
apilipenko marked 2 inline comments as done.

Address the comments.

Currently supports arm-eabi target only. I left ARM test cases with manually written checks for now.

4481 ↗(On Diff #81793)

This looks like a good idea; it enables combining an i64 pattern into two i32 loads on 32-bit targets (the loads are first combined into a single i64 load, which is then split into two i32 loads).

4559 ↗(On Diff #81793)

As a result we have a single load followed by an instruction sequence doing the swap. E.g. for load_i32_by_i8_bswap from test/CodeGen/ARM/load-combine.ll we'll have:

	ldr	r0, [r0]
	mov	r1, #65280
	mov	r2, #16711680
	and	r1, r1, r0, lsr #8
	and	r2, r2, r0, lsl #8
	orr	r1, r1, r0, lsr #24
	orr	r0, r2, r0, lsl #24
	orr	r0, r0, r1

instead of

	ldrb	r2, [r0, #1]
	ldrb	r1, [r0]
	ldrb	r3, [r0, #2]
	ldrb	r0, [r0, #3]
	lsl	r2, r2, #16
	orr	r1, r2, r1, lsl #24
	orr	r1, r1, r3, lsl #8
	orr	r0, r1, r0

Assuming that shuffling bytes in a register is cheaper than loading from memory, this looks like a generally good transformation.

A few minors - anyone else have any comments?

4503 ↗(On Diff #83613)

Are there any situations where we can use ZEXTLOAD? If so add a TODO comment?

4511 ↗(On Diff #83613)

Avoid a comparison we know the result of:

else if (Chain != LChain)
4518 ↗(On Diff #83613)

Avoid a comparison we know the result of:

else if (!Base->equalBaseIndex(Ptr))
4527 ↗(On Diff #83613)

Loop invariant - pull this out to the top of the function:

bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
654 ↗(On Diff #83613)

Add a TODO to the comment

filcab added a comment.Jan 9 2017, 7:20 AM

One minor comment.

4527 ↗(On Diff #83613)

Might as well pull the {Little,Big}EndianByteAt from the loop too.

apilipenko marked 5 inline comments as done.Jan 9 2017, 10:29 AM
apilipenko added inline comments.
4503 ↗(On Diff #83613)

I'm going to add ext loads support in a follow up patch. Left a TODO for now.

apilipenko updated this revision to Diff 83648.Jan 9 2017, 10:31 AM

Address the comments, add a comment that calculateByteProvider recurses over trees, not DAGs.

RKSimon accepted this revision.Jan 9 2017, 11:09 AM
RKSimon edited edge metadata.


This revision is now accepted and ready to land.Jan 9 2017, 11:09 AM
chandlerc edited edge metadata.Jan 13 2017, 7:10 PM

Sorry I didn't see this (much) sooner!

4393–4395 ↗(On Diff #83648)

I agree this doesn't need a set because it is tree structured, but other similar routines at the SDAG layer bound recursions. See for example computeKnownBits. I suspect this code should do something similar.

4446 ↗(On Diff #83648)

So, one path of what this is doing is essentially computing known-zero bits. I wonder if we should really be hand rolling that or whether we should instead use computeKnownBits. We could likely use computeKnownBits in the OR logic and then only handle recursing to find loads in the rest of these paths... Thoughts? maybe not worth it, hard to tell.

4485–4492 ↗(On Diff #83648)

Why only do the legality check when in the legalize phase? When would we want to combine loads to a non-legal integer type?

Is the goal here to combine loads into a too-wide illegal integer type to let the legalizer then split them into legal sized chunks? If so, that at least needs a comment. And in that case, why only up to 64-bit integers?

Alternatively, you could do the legality check in all phases and just never combine loads when the merged value is wider than the legal integer size.

4573–4577 ↗(On Diff #83648)

Shouldn't this check come first, up with the legalization stuff?

apilipenko marked an inline comment as done.Jan 16 2017, 4:45 AM
apilipenko added inline comments.
4446 ↗(On Diff #83648)

It's definitely an option, but I can't justify the change. In follow up changes I'm going to exploit the information about zero bytes in DAGCombiner::MatchLoadCombine to handle partially available patterns. That will introduce another point where I'll need to use computeKnownBits.

4485–4492 ↗(On Diff #83648)

Yes, the goal is to combine to a possibly too-wide load and let the legalizer split it later. This enables us to combine load i64 by i8 to a couple of i32 loads on 32 bit targets. Will add a comment.

The i64 limitation is somewhat arbitrary, just to limit the scope of the transformation; it can be lifted easily. (On the other hand, with the newly introduced depth limit in calculateByteProvider we won't be able to fold patterns much wider than i64.)

4573–4577 ↗(On Diff #83648)

allowsMemoryAccess specifically needs to know the address space and alignment. We don't know them before we do all the computations above.

apilipenko updated this revision to Diff 84547.Jan 16 2017, 4:54 AM
apilipenko edited edge metadata.

Address review comments:

  • Add recursion depth limit
  • Add comments about combine before legalize

Add a comment about the known worklist problem.

@chandlerc, since none of your comments express explicit objections or concerns and the patch has already been accepted, I'm going to go ahead and land it tomorrow. Let me know if I misinterpreted your comments and you do want some changes.

This revision was automatically updated to reflect the committed changes.