Diff 84547

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 371 Lines • ▼ Show 20 Lines	private:
SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,		SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits = true);		bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);		SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,		SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
SDValue InnerPos, SDValue InnerNeg,		SDValue InnerPos, SDValue InnerNeg,
unsigned PosOpcode, unsigned NegOpcode,		unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);		const SDLoc &DL);
SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);		SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
		SDValue MatchLoadCombine(SDNode *N);
SDValue ReduceLoadWidth(SDNode *N);		SDValue ReduceLoadWidth(SDNode *N);
SDValue ReduceLoadOpStoreWidth(SDNode *N);		SDValue ReduceLoadOpStoreWidth(SDNode *N);
SDValue splitMergedValStore(StoreSDNode *ST);		SDValue splitMergedValStore(StoreSDNode *ST);
SDValue TransformFPLoadStorePair(SDNode *N);		SDValue TransformFPLoadStorePair(SDNode *N);
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);		SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);		SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
SDValue reduceBuildVecToShuffle(SDNode *N);		SDValue reduceBuildVecToShuffle(SDNode *N);
SDValue createBuildVecShuffle(SDLoc DL, SDNode *N, ArrayRef<int> VectorMask,		SDValue createBuildVecShuffle(SDLoc DL, SDNode *N, ArrayRef<int> VectorMask,
▲ Show 20 Lines • Show All 3,585 Lines • ▼ Show 20 Lines	SDValue DAGCombiner::visitOR(SDNode *N) {
if (N0.getOpcode() == N1.getOpcode())		if (N0.getOpcode() == N1.getOpcode())
if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))		if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
return Tmp;		return Tmp;

// See if this is some rotate idiom.		// See if this is some rotate idiom.
if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))		if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
return SDValue(Rot, 0);		return SDValue(Rot, 0);

		if (SDValue Load = MatchLoadCombine(N))
		return Load;

// Simplify the operands using demanded-bits information.		// Simplify the operands using demanded-bits information.
if (!VT.isVector() &&		if (!VT.isVector() &&
SimplifyDemandedBits(SDValue(N, 0)))		SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);		return SDValue(N, 0);

return SDValue();		return SDValue();
}		}

▲ Show 20 Lines • Show All 355 Lines • ▼ Show 20 Lines	static BaseIndexOffset match(SDValue Ptr, SelectionDAG &DAG,
} else IsIndexSignExt = false;		} else IsIndexSignExt = false;

int64_t Off = cast<ConstantSDNode>(Offset)->getSExtValue();		int64_t Off = cast<ConstantSDNode>(Offset)->getSExtValue();
return BaseIndexOffset(Base, Index, Off + PartialOffset, IsIndexSignExt);		return BaseIndexOffset(Base, Index, Off + PartialOffset, IsIndexSignExt);
}		}
};		};
} // namespace		} // namespace

		namespace {
		/// Represents the known origin of an individual byte in a load combine
		/// pattern. The value of the byte is either constant zero or comes from
		/// memory.
		struct ByteProvider {
		  // For constant zero providers Load is set to nullptr. For memory providers
		  // Load represents the node which loads the byte from memory.
		  // ByteOffset is the offset of the byte in the value produced by the load.
		  LoadSDNode *Load;
		  unsigned ByteOffset;

		  /// Default-constructed providers are constant zero providers.
		  ByteProvider() : Load(nullptr), ByteOffset(0) {}

		  /// Creates a provider for byte \p ByteOffset of the value produced by
		  /// \p Load.
		  static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
		    return ByteProvider(Load, ByteOffset);
		  }
		  /// Creates a provider for a byte which is known to be zero.
		  static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }

		  // The queries below do not modify the provider, so they are const
		  // (consistent with operator==) and usable through const references.
		  bool isConstantZero() const { return !Load; }
		  bool isMemory() const { return Load != nullptr; }

		  bool operator==(const ByteProvider &Other) const {
		    return Other.Load == Load && Other.ByteOffset == ByteOffset;
		  }

		private:
		  ByteProvider(LoadSDNode *Load, unsigned ByteOffset)
		      : Load(Load), ByteOffset(ByteOffset) {}
		};

		/// Recursively traverses the expression calculating the origin of the requested
		/// byte of the given value. Returns None if the provider can't be calculated.
		///
		/// For all the values except the root of the expression verifies that the value
		filcabUnsubmitted Not Done Reply Inline Actions Nit: We're not "collecting" any more. Maybe `{get,calculate,deduce}ByteProvider`? (Or something else) Also fix the comment. filcab: Nit: We're not "collecting" any more. Maybe `{get,calculate,deduce}ByteProvider`? (Or something…
		/// has exactly one use and if it's not true return None. This way if the origin
		/// of the byte is returned it's guaranteed that the values which contribute to
		/// the byte are not used outside of this expression.
		///
		/// Because the parts of the expression are not allowed to have more than one
		/// use this function iterates over trees, not DAGs. So it never visits the same
		/// node more than once.
		chandlercUnsubmitted Done Reply Inline Actions I agree this doesn't need a set because it is tree structured, but other similar routines at the SDAG layer bound recursions. See for example computeKnownBits. I suspect this code should do something similar. chandlerc: I agree this doesn't need a set because it is tree structured, but other similar routines at…
		const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index,
		unsigned Depth,
		bool Root = false) {
		// Typical i64 by i8 pattern requires recursion up to 8 calls depth
		if (Depth == 10)
		return None;

		if (!Root && !Op.hasOneUse())
		return None;

		assert(Op.getValueType().isScalarInteger() && "can't handle other types");
		unsigned BitWidth = Op.getValueSizeInBits();
		if (BitWidth % 8 != 0)
		return None;
		unsigned ByteWidth = BitWidth / 8;
		assert(Index < ByteWidth && "invalid index requested");

		switch (Op.getOpcode()) {
		case ISD::OR: {
		auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
		if (!LHS)
		return None;
		auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
		if (!RHS)
		return None;

		if (LHS->isConstantZero())
		return RHS;
		else if (RHS->isConstantZero())
		return LHS;
		else
		return None;
		}
		case ISD::SHL: {
		auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
		if (!ShiftOp)
		return None;

		uint64_t BitShift = ShiftOp->getZExtValue();
		if (BitShift % 8 != 0)
		return None;
		uint64_t ByteShift = BitShift / 8;

		return Index < ByteShift
		? ByteProvider::getConstantZero()
		: calculateByteProvider(Op->getOperand(0), Index - ByteShift,
		Depth + 1);
		}
		case ISD::ZERO_EXTEND: {
		SDValue NarrowOp = Op->getOperand(0);
		unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
		chandlercUnsubmitted Not Done Reply Inline Actions So, one path of what this is doing is essentially computing known-zero bits. I wonder if we should really be hand rolling that or whether we should instead use computeKnownBits. We could likely use computeKnownBits in the OR logic and then only handle recursing to find loads in the rest of these paths... Thoughts? maybe not worth it, hard to tell. chandlerc: So, one path of what this is doing is essentially computing known-zero bits. I wonder if we…
		apilipenkoAuthorUnsubmitted Not Done Reply Inline Actions It's definitely an option, but I can't justify the change. In follow up changes I'm going to exploit the information about zero bytes in DAGCombiner::MatchLoadCombine to handle partially available patterns. That will introduce another point where I'll need to use computeKnownBits. apilipenko: It's definitely an option, but I can't justify the change. In follow up changes I'm going to…
		if (NarrowBitWidth % 8 != 0)
		return None;
		uint64_t NarrowByteWidth = NarrowBitWidth / 8;

		return Index >= NarrowByteWidth
		? ByteProvider::getConstantZero()
		: calculateByteProvider(NarrowOp, Index, Depth + 1);
		}
		case ISD::LOAD: {
		auto L = cast<LoadSDNode>(Op.getNode());

		// TODO: support ext loads
		if (L->isVolatile() \|\| L->isIndexed() \|\|
		L->getExtensionType() != ISD::NON_EXTLOAD)
		return None;

		return ByteProvider::getMemory(L, Index);
		}
		}

		return None;
		}
		} // namespace

		/// Match a pattern where a wide type scalar value is loaded by several narrow
		/// loads and combined by shifts and ors. Fold it into a single load or a load
		/// and a BSWAP if the targets supports it.
		///
		/// Assuming little endian target:
		/// i8 *a = ...
		/// i32 val = a[0] \| (a[1] << 8) \| (a[2] << 16) \| (a[3] << 24)
		/// =>
		/// i32 val = *((i32)a)
		///
		/// i8 *a = ...
		RKSimonUnsubmitted Not Done Reply Inline Actions What is the effect of changing this to: if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT)) Would the legalize do such a bad job of splitting poorly combined loads/bswaps? RKSimon: What is the effect of changing this to: ``` if (LegalOperations && !TLI.isOperationLegal(ISD…
		apilipenkoAuthorUnsubmitted Not Done Reply Inline Actions This looks like a good idea, it enables combining of i64 pattern to two i32 loads on 32 bit targets (first loads are combined to a single i64 load and then it is split into to i32 loads). apilipenko: This looks like a good idea, it enables combining of i64 pattern to two i32 loads on 32 bit…
		/// i32 val = (a[0] << 24) \| (a[1] << 16) \| (a[2] << 8) \| a[3]
		/// =>
		/// i32 val = BSWAP(*((i32)a))
		///
		/// TODO: This rule matches complex patterns with OR node roots and doesn't
		/// interact well with the worklist mechanism. When a part of the pattern is
		/// updated (e.g. one of the loads) its direct users are put into the worklist,
		/// but the root node of the pattern which triggers the load combine is not
		/// necessarily a direct user of the changed node. For example, once the address
		/// of t28 load is reassociated load combine won't be triggered:
		/// t25: i32 = add t4, Constant:i32<2>
		chandlercUnsubmitted Not Done Reply Inline Actions Why only do the legality check when in the legalize phase? When would we want to combine loads to a non-legal integer type? Is the goal here to combine loads into a too-wide illegal integer type to let the legalizer then split them into legal sized chunks? If so, that at least needs a comment. And in that case, why only up to 64-bit integers? Alternatively, you could do the legality check in all phases and just never combine stores when the merged value is wider than the legal integer size. chandlerc: Why only do the legality check when in the legalize phase? When would we want to combine loads…
		apilipenkoAuthorUnsubmitted Not Done Reply Inline Actions Yes, the goal is to combine to a possibly too-wide load and let the legalizer split it later. This enables us to combine load i64 by i8 to a couple of i32 loads on 32 bit targets. Will add a comment. The i64 limitation is somewhat arbitrary just to limit the scope of the transformation. It can be lifted easily. (On the other hand with the newly introduced depth limit in calculateByteProvider we won't be able to fold patterns much wider than i64) apilipenko: Yes, the goal is to combine to a possibly too-wide load and let the legalizer split it later.
		/// t26: i64 = sign_extend t25
		/// t27: i64 = add t2, t26
		/// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
		/// t29: i32 = zero_extend t28
		/// t32: i32 = shl t29, Constant:i8<8>
		/// t33: i32 = or t23, t32
		/// As a possible fix visitLoad can check if the load can be a part of a load
		/// combine pattern and add corresponding OR roots to the worklist.
		SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
		assert(N->getOpcode() == ISD::OR &&
		"Can only match load combining against OR nodes");
		RKSimonUnsubmitted Not Done Reply Inline Actions Are there any situations where we can use ZEXTLOAD? If so add a TODO comment? RKSimon: Are there any situations where we can use ZEXTLOAD? If so add a TODO comment?
		apilipenkoAuthorUnsubmitted Not Done Reply Inline Actions I'm going to add ext loads support in a follow up patch. Left a TODO for now. apilipenko: I'm going to add ext loads support in a follow up patch. Left a TODO for now.

		// Handles simple types only
		EVT VT = N->getValueType(0);
		if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
		return SDValue();
		unsigned ByteWidth = VT.getSizeInBits() / 8;

		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
		RKSimonUnsubmitted Done Reply Inline Actions Avoid a comparison we know the result of: else if (Chain != LChain) RKSimon: Avoid a comparison we know the result of: ``` else if (Chain != LChain) ```
		// Before legalize we can introduce too wide illegal loads which will be later
		// split into legal sized loads. This enables us to combine i64 load by i8
		// patterns to a couple of i32 loads on 32 bit targets.
		if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT))
		return SDValue();

		auto LittleEndianByteAt = [](unsigned BW, unsigned i) { return i; };
		RKSimonUnsubmitted Done Reply Inline Actions Avoid a comparison we know the result of: else if (!Base->equalBaseIndex(Ptr)) RKSimon: Avoid a comparison we know the result of: ``` else if (!Base->equalBaseIndex(Ptr)) ```
		auto BigEndianByteAt = [](unsigned BW, unsigned i) { return BW - i - 1; };

		Optional<BaseIndexOffset> Base;
		SDValue Chain;

		SmallSet<LoadSDNode *, 8> Loads;
		LoadSDNode *FirstLoad = nullptr;

		bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
		RKSimonUnsubmitted Done Reply Inline Actions Loop invariant - pull this out to the top of the function: bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); RKSimon: Loop invariant - pull this out to the top of the function: ``` bool IsBigEndianTarget = DAG.
		filcabUnsubmitted Done Reply Inline Actions Might as well pull the `{Little,Big}EndianByteAt` from the loop too. filcab: Might as well pull the `{Little,Big}EndianByteAt` from the loop too.
		auto ByteAt = IsBigEndianTarget ? BigEndianByteAt : LittleEndianByteAt;

		// Check if all the bytes of the OR we are looking at are loaded from the same
		// base address. Collect bytes offsets from Base address in ByteOffsets.
		SmallVector<int64_t, 4> ByteOffsets(ByteWidth);
		for (unsigned i = 0; i < ByteWidth; i++) {
		auto P = calculateByteProvider(SDValue(N, 0), i, 0, /Root=/true);
		if (!P \|\| !P->isMemory()) // All the bytes must be loaded from memory
		return SDValue();

		LoadSDNode *L = P->Load;
		assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() &&
		(L->getExtensionType() == ISD::NON_EXTLOAD) &&
		RKSimonUnsubmitted Done Reply Inline Actions Please make the assert messages more explanatory. RKSimon: Please make the assert messages more explanatory.
		"Must be enforced by calculateByteProvider");
		assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");

		// All loads must share the same chain
		SDValue LChain = L->getChain();
		if (!Chain)
		Chain = LChain;
		else if (Chain != LChain)
		return SDValue();

		// Loads must share the same base address
		BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG);
		if (!Base)
		Base = Ptr;
		else if (!Base->equalBaseIndex(Ptr))
		return SDValue();

		// Calculate the offset of the current byte from the base address
		unsigned LoadBitWidth = L->getMemoryVT().getSizeInBits();
		RKSimonUnsubmitted Not Done Reply Inline Actions Would this work? if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT)) RKSimon: Would this work? ``` if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP…
		filcabUnsubmitted Not Done Reply Inline Actions I wonder if it's useful to generate a bswap only to change it back later. Do you have an example of something llvm already does? Or would this be a future optimization possibility? filcab: I wonder if it's useful to generate a bswap only to change it back later. Do you have an…
		apilipenkoAuthorUnsubmitted Not Done Reply Inline Actions As a result we have a single load followed by an instruction sequence doing the swap. E.g. for load_i32_by_i8_bswap from test/CodeGen/ARM/load-combine.ll we'll have: ldr r0, [r0] mov r1, #65280 mov r2, #16711680 and r1, r1, r0, lsr #8 and r2, r2, r0, lsl #8 orr r1, r1, r0, lsr #24 orr r0, r2, r0, lsl #24 orr r0, r0, r1 instead of ldrb r2, [r0, #1] ldrb r1, [r0] ldrb r3, [r0, #2] ldrb r0, [r0, #3] lsl r2, r2, #16 orr r1, r2, r1, lsl #24 orr r1, r1, r3, lsl #8 orr r0, r1, r0 Assuming that shuffling bytes in a register is cheaper that loading from memory it looks like a generally good transformation. apilipenko: As a result we have a single load followed by an instruction sequence doing the swap. E.g. for…
		assert(LoadBitWidth % 8 == 0 &&
		"can only analyze providers for individual bytes not bit");
		unsigned LoadByteWidth = LoadBitWidth / 8;
		int64_t MemoryByteOffset = ByteAt(LoadByteWidth, P->ByteOffset);
		int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset;
		ByteOffsets[i] = ByteOffsetFromBase;

		// Remember the first byte load
		if (ByteOffsetFromBase == 0)
		FirstLoad = L;

		Loads.insert(L);
		}
		assert(Loads.size() > 0 && "All the bytes of the value must be loaded from "
		"memory, so there must be at least one load which produces the value");
		assert(Base && "Base address of the accessed memory location must be set");

		// Check if the bytes of the OR we are looking at match with either big or
		chandlercUnsubmitted Not Done Reply Inline Actions Shouldn't this check come first, up with the legalization stuff? chandlerc: Shouldn't this check come first, up with the legalization stuff?
		apilipenkoAuthorUnsubmitted Not Done Reply Inline Actions allowsMemoryAccess needs to know the address, addrspace and alignment specifically. We don't know it before we do all the computations above. apilipenko: allowsMemoryAccess needs to know the address, addrspace and alignment specifically. We don't…
		// little endian value load
		bool BigEndian = true, LittleEndian = true;
		for (unsigned i = 0; i < ByteWidth; i++) {
		LittleEndian &= ByteOffsets[i] == LittleEndianByteAt(ByteWidth, i);
		BigEndian &= ByteOffsets[i] == BigEndianByteAt(ByteWidth, i);
		if (!BigEndian && !LittleEndian)
		return SDValue();
		}
		assert((BigEndian != LittleEndian) && "should be either or");
		assert(FirstLoad && "must be set");

		// The node we are looking at matches with the pattern, check if we can
		// replace it with a single load and bswap if needed.

		// If the load needs byte swap check if the target supports it
		bool NeedsBswap = IsBigEndianTarget != BigEndian;

		// Before legalize we can introduce illegal bswaps which will be later
		// converted to an explicit bswap sequence. This way we end up with a single
		// load and byte shuffling instead of several loads and byte shuffling.
		if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
		return SDValue();

		// Check that a load of the wide type is both allowed and fast on the target
		bool Fast = false;
		bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
		VT, FirstLoad->getAddressSpace(),
		FirstLoad->getAlignment(), &Fast);
		if (!Allowed \|\| !Fast)
		return SDValue();

		SDValue NewLoad =
		DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
		FirstLoad->getPointerInfo(), FirstLoad->getAlignment());

		// Transfer chain users from old loads to the new load.
		for (LoadSDNode *L : Loads)
		DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));

		return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad;
		}

SDValue DAGCombiner::visitXOR(SDNode *N) {		SDValue DAGCombiner::visitXOR(SDNode *N) {
SDValue N0 = N->getOperand(0);		SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);		SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();		EVT VT = N0.getValueType();

// fold vector ops		// fold vector ops
if (VT.isVector()) {		if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))		if (SDValue FoldedVOp = SimplifyVBinOp(N))
▲ Show 20 Lines • Show All 9,991 Lines • Show Last 20 Lines

test/CodeGen/AArch64/load-combine-big-endian.ll

This file was added.

				; RUN: llc < %s -mtriple=arm64eb-unknown | FileCheck %s

				; i8* p; // p is 4 byte aligned
				; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
				define i32 @load_i32_by_i8_big_endian(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8_big_endian:
				; CHECK: ldr w0, [x0]
				; CHECK-NEXT: ret
				%tmp = bitcast i32* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 4
				%tmp2 = zext i8 %tmp1 to i32
				%tmp3 = shl nuw nsw i32 %tmp2, 24
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 16
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 8
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = or i32 %tmp13, %tmp16
				ret i32 %tmp17
				}

				; i8* p; // p is 4 byte aligned
				; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[2] << 8) | (i16) p[3])
				define i32 @load_i32_by_i16_by_i8_big_endian(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i16_by_i8_big_endian:
				; CHECK: ldr w0, [x0]
				; CHECK-NEXT: ret
				%tmp = bitcast i32* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 4
				%tmp2 = zext i8 %tmp1 to i16
				%tmp3 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp4 = load i8, i8* %tmp3, align 1
				%tmp5 = zext i8 %tmp4 to i16
				%tmp6 = shl nuw nsw i16 %tmp2, 8
				%tmp7 = or i16 %tmp6, %tmp5
				%tmp8 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp9 = load i8, i8* %tmp8, align 1
				%tmp10 = zext i8 %tmp9 to i16
				%tmp11 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp12 = load i8, i8* %tmp11, align 1
				%tmp13 = zext i8 %tmp12 to i16
				%tmp14 = shl nuw nsw i16 %tmp10, 8
				%tmp15 = or i16 %tmp14, %tmp13
				%tmp16 = zext i16 %tmp7 to i32
				%tmp17 = zext i16 %tmp15 to i32
				%tmp18 = shl nuw nsw i32 %tmp16, 16
				%tmp19 = or i32 %tmp18, %tmp17
				ret i32 %tmp19
				}

				; i16* p; // p is 4 byte aligned
				; ((i32) p[0] << 16) | (i32) p[1]
				define i32 @load_i32_by_i16(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i16:
				; CHECK: ldr w0, [x0]
				; CHECK-NEXT: ret
				%tmp = bitcast i32* %arg to i16*
				%tmp1 = load i16, i16* %tmp, align 4
				%tmp2 = zext i16 %tmp1 to i32
				%tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
				%tmp4 = load i16, i16* %tmp3, align 1
				%tmp5 = zext i16 %tmp4 to i32
				%tmp6 = shl nuw nsw i32 %tmp2, 16
				%tmp7 = or i32 %tmp6, %tmp5
				ret i32 %tmp7
				}

				; i16* p_16; // p_16 is 4 byte aligned
				; i8* p_8 = (i8*) p_16;
				; ((i32) p_16[0] << 16) | ((i32) p_8[2] << 8) | (i32) p_8[3]
				define i32 @load_i32_by_i16_i8(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i16_i8:
				; CHECK: ldr w0, [x0]
				; CHECK-NEXT: ret
				%tmp = bitcast i32* %arg to i16*
				%tmp1 = bitcast i32* %arg to i8*
				%tmp2 = load i16, i16* %tmp, align 4
				%tmp3 = zext i16 %tmp2 to i32
				%tmp4 = shl nuw nsw i32 %tmp3, 16
				%tmp5 = getelementptr inbounds i8, i8* %tmp1, i32 2
				%tmp6 = load i8, i8* %tmp5, align 1
				%tmp7 = zext i8 %tmp6 to i32
				%tmp8 = shl nuw nsw i32 %tmp7, 8
				%tmp9 = getelementptr inbounds i8, i8* %tmp1, i32 3
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = or i32 %tmp8, %tmp11
				%tmp13 = or i32 %tmp12, %tmp4
				ret i32 %tmp13
				}

				; i8* p; // p is 8 byte aligned
				; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56)
				define i64 @load_i64_by_i8_bswap(i64* %arg) {
				; CHECK-LABEL: load_i64_by_i8_bswap:
				; CHECK: ldr x8, [x0]
				; CHECK-NEXT: rev x0, x8
				; CHECK-NEXT: ret
				%tmp = bitcast i64* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 8
				%tmp2 = zext i8 %tmp1 to i64
				%tmp3 = getelementptr inbounds i8, i8* %tmp, i64 1
				%tmp4 = load i8, i8* %tmp3, align 1
				%tmp5 = zext i8 %tmp4 to i64
				%tmp6 = shl nuw nsw i64 %tmp5, 8
				%tmp7 = or i64 %tmp6, %tmp2
				%tmp8 = getelementptr inbounds i8, i8* %tmp, i64 2
				%tmp9 = load i8, i8* %tmp8, align 1
				%tmp10 = zext i8 %tmp9 to i64
				%tmp11 = shl nuw nsw i64 %tmp10, 16
				%tmp12 = or i64 %tmp7, %tmp11
				%tmp13 = getelementptr inbounds i8, i8* %tmp, i64 3
				%tmp14 = load i8, i8* %tmp13, align 1
				%tmp15 = zext i8 %tmp14 to i64
				%tmp16 = shl nuw nsw i64 %tmp15, 24
				%tmp17 = or i64 %tmp12, %tmp16
				%tmp18 = getelementptr inbounds i8, i8* %tmp, i64 4
				%tmp19 = load i8, i8* %tmp18, align 1
				%tmp20 = zext i8 %tmp19 to i64
				%tmp21 = shl nuw nsw i64 %tmp20, 32
				%tmp22 = or i64 %tmp17, %tmp21
				%tmp23 = getelementptr inbounds i8, i8* %tmp, i64 5
				%tmp24 = load i8, i8* %tmp23, align 1
				%tmp25 = zext i8 %tmp24 to i64
				%tmp26 = shl nuw nsw i64 %tmp25, 40
				%tmp27 = or i64 %tmp22, %tmp26
				%tmp28 = getelementptr inbounds i8, i8* %tmp, i64 6
				%tmp29 = load i8, i8* %tmp28, align 1
				%tmp30 = zext i8 %tmp29 to i64
				%tmp31 = shl nuw nsw i64 %tmp30, 48
				%tmp32 = or i64 %tmp27, %tmp31
				%tmp33 = getelementptr inbounds i8, i8* %tmp, i64 7
				%tmp34 = load i8, i8* %tmp33, align 1
				%tmp35 = zext i8 %tmp34 to i64
				%tmp36 = shl nuw i64 %tmp35, 56
				%tmp37 = or i64 %tmp32, %tmp36
				ret i64 %tmp37
				}

				; i8* p; // p is 8 byte aligned
				; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7]
				define i64 @load_i64_by_i8(i64* %arg) {
				; CHECK-LABEL: load_i64_by_i8:
				; CHECK: ldr x0, [x0]
				; CHECK-NEXT: ret
				%tmp = bitcast i64* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 8
				%tmp2 = zext i8 %tmp1 to i64
				%tmp3 = shl nuw i64 %tmp2, 56
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i64 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i64
				%tmp7 = shl nuw nsw i64 %tmp6, 48
				%tmp8 = or i64 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i64 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i64
				%tmp12 = shl nuw nsw i64 %tmp11, 40
				%tmp13 = or i64 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i64 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i64
				%tmp17 = shl nuw nsw i64 %tmp16, 32
				%tmp18 = or i64 %tmp13, %tmp17
				%tmp19 = getelementptr inbounds i8, i8* %tmp, i64 4
				%tmp20 = load i8, i8* %tmp19, align 1
				%tmp21 = zext i8 %tmp20 to i64
				%tmp22 = shl nuw nsw i64 %tmp21, 24
				%tmp23 = or i64 %tmp18, %tmp22
				%tmp24 = getelementptr inbounds i8, i8* %tmp, i64 5
				%tmp25 = load i8, i8* %tmp24, align 1
				%tmp26 = zext i8 %tmp25 to i64
				%tmp27 = shl nuw nsw i64 %tmp26, 16
				%tmp28 = or i64 %tmp23, %tmp27
				%tmp29 = getelementptr inbounds i8, i8* %tmp, i64 6
				%tmp30 = load i8, i8* %tmp29, align 1
				%tmp31 = zext i8 %tmp30 to i64
				%tmp32 = shl nuw nsw i64 %tmp31, 8
				%tmp33 = or i64 %tmp28, %tmp32
				%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7
				%tmp35 = load i8, i8* %tmp34, align 1
				%tmp36 = zext i8 %tmp35 to i64
				%tmp37 = or i64 %tmp33, %tmp36
				ret i64 %tmp37
				}

test/CodeGen/AArch64/load-combine.ll

This file was added.

				; RUN: llc < %s -mtriple=arm64-unknown | FileCheck %s

				; i8* p; // p is 1 byte aligned
				; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
				define i32 @load_i32_by_i8_unaligned(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8_unaligned:
				; CHECK: ldr w0, [x0]
				; CHECK-NEXT: ret
				%tmp = bitcast i32* %arg to i8*
				%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
				%tmp2 = load i8, i8* %tmp, align 1
				%tmp3 = zext i8 %tmp2 to i32
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 8
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 16
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = shl nuw nsw i32 %tmp16, 24
				%tmp18 = or i32 %tmp13, %tmp17
				ret i32 %tmp18
				}

				; i8* p; // p is 4 byte aligned
				; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
				define i32 @load_i32_by_i8_aligned(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8_aligned:
				; CHECK: ldr w0, [x0]
				; CHECK-NEXT: ret
				%tmp = bitcast i32* %arg to i8*
				%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
				%tmp2 = load i8, i8* %tmp, align 4
				%tmp3 = zext i8 %tmp2 to i32
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 8
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 16
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = shl nuw nsw i32 %tmp16, 24
				%tmp18 = or i32 %tmp13, %tmp17
				ret i32 %tmp18
				}

				; i8* p; // p is 4 byte aligned
				; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
				define i32 @load_i32_by_i8_bswap(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8_bswap:
				; CHECK: ldr w8, [x0]
				; CHECK-NEXT: rev w0, w8
				; CHECK-NEXT: ret
				%tmp = bitcast i32* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 4
				%tmp2 = zext i8 %tmp1 to i32
				%tmp3 = shl nuw nsw i32 %tmp2, 24
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 16
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 8
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = or i32 %tmp13, %tmp16
				ret i32 %tmp17
				}

				; i8* p; // p is 8 byte aligned
				; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56)
				define i64 @load_i64_by_i8(i64* %arg) {
				; CHECK-LABEL: load_i64_by_i8:
				; CHECK: ldr x0, [x0]
				; CHECK-NEXT: ret
				%tmp = bitcast i64* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 8
				%tmp2 = zext i8 %tmp1 to i64
				%tmp3 = getelementptr inbounds i8, i8* %tmp, i64 1
				%tmp4 = load i8, i8* %tmp3, align 1
				%tmp5 = zext i8 %tmp4 to i64
				%tmp6 = shl nuw nsw i64 %tmp5, 8
				%tmp7 = or i64 %tmp6, %tmp2
				%tmp8 = getelementptr inbounds i8, i8* %tmp, i64 2
				%tmp9 = load i8, i8* %tmp8, align 1
				%tmp10 = zext i8 %tmp9 to i64
				%tmp11 = shl nuw nsw i64 %tmp10, 16
				%tmp12 = or i64 %tmp7, %tmp11
				%tmp13 = getelementptr inbounds i8, i8* %tmp, i64 3
				%tmp14 = load i8, i8* %tmp13, align 1
				%tmp15 = zext i8 %tmp14 to i64
				%tmp16 = shl nuw nsw i64 %tmp15, 24
				%tmp17 = or i64 %tmp12, %tmp16
				%tmp18 = getelementptr inbounds i8, i8* %tmp, i64 4
				%tmp19 = load i8, i8* %tmp18, align 1
				%tmp20 = zext i8 %tmp19 to i64
				%tmp21 = shl nuw nsw i64 %tmp20, 32
				%tmp22 = or i64 %tmp17, %tmp21
				%tmp23 = getelementptr inbounds i8, i8* %tmp, i64 5
				%tmp24 = load i8, i8* %tmp23, align 1
				%tmp25 = zext i8 %tmp24 to i64
				%tmp26 = shl nuw nsw i64 %tmp25, 40
				%tmp27 = or i64 %tmp22, %tmp26
				%tmp28 = getelementptr inbounds i8, i8* %tmp, i64 6
				%tmp29 = load i8, i8* %tmp28, align 1
				%tmp30 = zext i8 %tmp29 to i64
				%tmp31 = shl nuw nsw i64 %tmp30, 48
				%tmp32 = or i64 %tmp27, %tmp31
				%tmp33 = getelementptr inbounds i8, i8* %tmp, i64 7
				%tmp34 = load i8, i8* %tmp33, align 1
				%tmp35 = zext i8 %tmp34 to i64
				%tmp36 = shl nuw i64 %tmp35, 56
				%tmp37 = or i64 %tmp32, %tmp36
				ret i64 %tmp37
				}

				; i8* p; // p is 8 byte aligned
				; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7]
				; p[0] lands in the most significant byte, so the expected output is a
				; single 64-bit load followed by a byte reverse (rev).
				define i64 @load_i64_by_i8_bswap(i64* %arg) {
				; CHECK-LABEL: load_i64_by_i8_bswap:
				; CHECK: ldr x8, [x0]
				; CHECK-NEXT: rev x0, x8
				; CHECK-NEXT: ret
				%tmp = bitcast i64* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 8
				%tmp2 = zext i8 %tmp1 to i64
				%tmp3 = shl nuw i64 %tmp2, 56
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i64 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i64
				%tmp7 = shl nuw nsw i64 %tmp6, 48
				%tmp8 = or i64 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i64 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i64
				%tmp12 = shl nuw nsw i64 %tmp11, 40
				%tmp13 = or i64 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i64 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i64
				%tmp17 = shl nuw nsw i64 %tmp16, 32
				%tmp18 = or i64 %tmp13, %tmp17
				%tmp19 = getelementptr inbounds i8, i8* %tmp, i64 4
				%tmp20 = load i8, i8* %tmp19, align 1
				%tmp21 = zext i8 %tmp20 to i64
				%tmp22 = shl nuw nsw i64 %tmp21, 24
				%tmp23 = or i64 %tmp18, %tmp22
				%tmp24 = getelementptr inbounds i8, i8* %tmp, i64 5
				%tmp25 = load i8, i8* %tmp24, align 1
				%tmp26 = zext i8 %tmp25 to i64
				%tmp27 = shl nuw nsw i64 %tmp26, 16
				%tmp28 = or i64 %tmp23, %tmp27
				%tmp29 = getelementptr inbounds i8, i8* %tmp, i64 6
				%tmp30 = load i8, i8* %tmp29, align 1
				%tmp31 = zext i8 %tmp30 to i64
				%tmp32 = shl nuw nsw i64 %tmp31, 8
				%tmp33 = or i64 %tmp28, %tmp32
				%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7
				%tmp35 = load i8, i8* %tmp34, align 1
				%tmp36 = zext i8 %tmp35 to i64
				%tmp37 = or i64 %tmp33, %tmp36
				ret i64 %tmp37
				}

test/CodeGen/ARM/load-combine-big-endian.ll

This file was added.

				; RUN: llc < %s -mtriple=armeb-unknown | FileCheck %s
				; RUN: llc < %s -mtriple=armv6eb-unknown | FileCheck %s --check-prefix=CHECK-ARMv6

				; i8* p; // p is 4 byte aligned
				; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
				; p[0] lands in the most significant byte; both run lines expect a single
				; ldr with no byte-swapping instructions.
				define i32 @load_i32_by_i8_big_endian(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8_big_endian:
				; CHECK: ldr r0, [r0]
				; CHECK-NEXT: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i32_by_i8_big_endian:
				; CHECK-ARMv6: ldr r0, [r0]
				; CHECK-ARMv6-NEXT: bx lr
				%tmp = bitcast i32* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 4
				%tmp2 = zext i8 %tmp1 to i32
				%tmp3 = shl nuw nsw i32 %tmp2, 24
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 16
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 8
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = or i32 %tmp13, %tmp16
				ret i32 %tmp17
				}

				; i8* p; // p is 4 byte aligned
				; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
				; p[0] lands in the least significant byte, so the combined load needs a
				; byte swap on these run lines.
				define i32 @load_i32_by_i8_bswap(i32* %arg) {
				; The default run line expects the byte swap expanded into an and/orr
				; sequence after the ldr; the ARMv6 run line expects a single rev.
				; CHECK-LABEL: load_i32_by_i8_bswap:
				; CHECK: ldr r0, [r0]
				; CHECK: and
				; CHECK-NEXT: and
				; CHECK-NEXT: orr
				; CHECK-NEXT: orr
				; CHECK-NEXT: orr
				; CHECK-NEXT: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
				; CHECK-ARMv6: ldr r0, [r0]
				; CHECK-ARMv6-NEXT: rev r0, r0
				; CHECK-ARMv6-NEXT: bx lr
				%tmp = bitcast i32* %arg to i8*
				; NOTE: %tmp1 is never used; the first load below reads %tmp directly.
				%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
				%tmp2 = load i8, i8* %tmp, align 4
				%tmp3 = zext i8 %tmp2 to i32
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 8
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 16
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = shl nuw nsw i32 %tmp16, 24
				%tmp18 = or i32 %tmp13, %tmp17
				ret i32 %tmp18
				}

				; i8* p; // p is 4 byte aligned
				; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[2] << 8) | (i16) p[3])
				; Two i16 halves are each built from two byte loads (offsets 0/1 and 2/3)
				; and then joined; both run lines expect a single ldr.
				define i32 @load_i32_by_i16_by_i8_big_endian(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i16_by_i8_big_endian:
				; CHECK: ldr r0, [r0]
				; CHECK-NEXT: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i32_by_i16_by_i8_big_endian:
				; CHECK-ARMv6: ldr r0, [r0]
				; CHECK-ARMv6-NEXT: bx lr
				%tmp = bitcast i32* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 4
				%tmp2 = zext i8 %tmp1 to i16
				%tmp3 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp4 = load i8, i8* %tmp3, align 1
				%tmp5 = zext i8 %tmp4 to i16
				%tmp6 = shl nuw nsw i16 %tmp2, 8
				%tmp7 = or i16 %tmp6, %tmp5
				%tmp8 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp9 = load i8, i8* %tmp8, align 1
				%tmp10 = zext i8 %tmp9 to i16
				%tmp11 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp12 = load i8, i8* %tmp11, align 1
				%tmp13 = zext i8 %tmp12 to i16
				%tmp14 = shl nuw nsw i16 %tmp10, 8
				%tmp15 = or i16 %tmp14, %tmp13
				%tmp16 = zext i16 %tmp7 to i32
				%tmp17 = zext i16 %tmp15 to i32
				%tmp18 = shl nuw nsw i32 %tmp16, 16
				%tmp19 = or i32 %tmp18, %tmp17
				ret i32 %tmp19
				}

				; i16* p; // p is 4 byte aligned
				; ((i32) p[0] << 16) | (i32) p[1]
				; Two adjacent i16 loads, with p[0] in the upper half; both run lines
				; expect them to fold into a single ldr.
				define i32 @load_i32_by_i16(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i16:
				; CHECK: ldr r0, [r0]
				; CHECK-NEXT: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i32_by_i16:
				; CHECK-ARMv6: ldr r0, [r0]
				; CHECK-ARMv6-NEXT: bx lr
				%tmp = bitcast i32* %arg to i16*
				%tmp1 = load i16, i16* %tmp, align 4
				%tmp2 = zext i16 %tmp1 to i32
				%tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
				%tmp4 = load i16, i16* %tmp3, align 1
				%tmp5 = zext i16 %tmp4 to i32
				%tmp6 = shl nuw nsw i32 %tmp2, 16
				%tmp7 = or i32 %tmp6, %tmp5
				ret i32 %tmp7
				}

				; i16* p_16; // p_16 is 4 byte aligned
				; i8* p_8 = (i8*) p_16;
				; ((i32) p_16[0] << 16) | ((i32) p_8[2] << 8) | (i32) p_8[3]
				; Mixed-width pattern: one i16 load for the upper half plus two byte
				; loads for the lower half; both run lines expect a single ldr.
				define i32 @load_i32_by_i16_i8(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i16_i8:
				; CHECK: ldr r0, [r0]
				; CHECK-NEXT: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i32_by_i16_i8:
				; CHECK-ARMv6: ldr r0, [r0]
				; CHECK-ARMv6-NEXT: bx lr
				%tmp = bitcast i32* %arg to i16*
				%tmp1 = bitcast i32* %arg to i8*
				%tmp2 = load i16, i16* %tmp, align 4
				%tmp3 = zext i16 %tmp2 to i32
				%tmp4 = shl nuw nsw i32 %tmp3, 16
				%tmp5 = getelementptr inbounds i8, i8* %tmp1, i32 2
				%tmp6 = load i8, i8* %tmp5, align 1
				%tmp7 = zext i8 %tmp6 to i32
				%tmp8 = shl nuw nsw i32 %tmp7, 8
				%tmp9 = getelementptr inbounds i8, i8* %tmp1, i32 3
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = or i32 %tmp8, %tmp11
				%tmp13 = or i32 %tmp12, %tmp4
				ret i32 %tmp13
				}

				; i8* p; // p is 8 byte aligned
				; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56)
				; p[0] lands in the least significant byte. The default run line expects
				; two word loads plus an and/orr expansion of the swap; the ARMv6 run line
				; expects ldrd followed by one rev per 32-bit half.
				define i64 @load_i64_by_i8_bswap(i64* %arg) {
				; CHECK-LABEL: load_i64_by_i8_bswap:
				; CHECK: ldr{{.*}}r0
				; CHECK: ldr{{.*}}r0
				; CHECK: and
				; CHECK-NEXT: and
				; CHECK-NEXT: orr
				; CHECK-NEXT: orr
				; CHECK-NEXT: and
				; CHECK-NEXT: orr
				; CHECK-NEXT: and
				; CHECK-NEXT: orr
				; CHECK-NEXT: orr
				; CHECK-NEXT: orr
				; CHECK: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
				; CHECK-ARMv6: ldrd r2, r3, [r0]
				; CHECK-ARMv6: rev r0, r3
				; CHECK-ARMv6: rev r1, r2
				; CHECK-ARMv6: bx lr
				%tmp = bitcast i64* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 8
				%tmp2 = zext i8 %tmp1 to i64
				%tmp3 = getelementptr inbounds i8, i8* %tmp, i64 1
				%tmp4 = load i8, i8* %tmp3, align 1
				%tmp5 = zext i8 %tmp4 to i64
				%tmp6 = shl nuw nsw i64 %tmp5, 8
				%tmp7 = or i64 %tmp6, %tmp2
				%tmp8 = getelementptr inbounds i8, i8* %tmp, i64 2
				%tmp9 = load i8, i8* %tmp8, align 1
				%tmp10 = zext i8 %tmp9 to i64
				%tmp11 = shl nuw nsw i64 %tmp10, 16
				%tmp12 = or i64 %tmp7, %tmp11
				%tmp13 = getelementptr inbounds i8, i8* %tmp, i64 3
				%tmp14 = load i8, i8* %tmp13, align 1
				%tmp15 = zext i8 %tmp14 to i64
				%tmp16 = shl nuw nsw i64 %tmp15, 24
				%tmp17 = or i64 %tmp12, %tmp16
				%tmp18 = getelementptr inbounds i8, i8* %tmp, i64 4
				%tmp19 = load i8, i8* %tmp18, align 1
				%tmp20 = zext i8 %tmp19 to i64
				%tmp21 = shl nuw nsw i64 %tmp20, 32
				%tmp22 = or i64 %tmp17, %tmp21
				%tmp23 = getelementptr inbounds i8, i8* %tmp, i64 5
				%tmp24 = load i8, i8* %tmp23, align 1
				%tmp25 = zext i8 %tmp24 to i64
				%tmp26 = shl nuw nsw i64 %tmp25, 40
				%tmp27 = or i64 %tmp22, %tmp26
				%tmp28 = getelementptr inbounds i8, i8* %tmp, i64 6
				%tmp29 = load i8, i8* %tmp28, align 1
				%tmp30 = zext i8 %tmp29 to i64
				%tmp31 = shl nuw nsw i64 %tmp30, 48
				%tmp32 = or i64 %tmp27, %tmp31
				%tmp33 = getelementptr inbounds i8, i8* %tmp, i64 7
				%tmp34 = load i8, i8* %tmp33, align 1
				%tmp35 = zext i8 %tmp34 to i64
				%tmp36 = shl nuw i64 %tmp35, 56
				%tmp37 = or i64 %tmp32, %tmp36
				ret i64 %tmp37
				}

				; i8* p; // p is 8 byte aligned
				; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7]
				; p[0] lands in the most significant byte. The default run line expects
				; two plain word loads; the ARMv6 run line expects a single ldrd. Neither
				; expects any byte swapping.
				define i64 @load_i64_by_i8(i64* %arg) {
				; CHECK-LABEL: load_i64_by_i8:
				; CHECK: ldr r2, [r0]
				; CHECK: ldr r1, [r0, #4]
				; CHECK: mov r0, r2
				; CHECK: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i64_by_i8:
				; CHECK-ARMv6: ldrd r0, r1, [r0]
				; CHECK-ARMv6: bx lr
				%tmp = bitcast i64* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 8
				%tmp2 = zext i8 %tmp1 to i64
				%tmp3 = shl nuw i64 %tmp2, 56
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i64 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i64
				%tmp7 = shl nuw nsw i64 %tmp6, 48
				%tmp8 = or i64 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i64 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i64
				%tmp12 = shl nuw nsw i64 %tmp11, 40
				%tmp13 = or i64 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i64 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i64
				%tmp17 = shl nuw nsw i64 %tmp16, 32
				%tmp18 = or i64 %tmp13, %tmp17
				%tmp19 = getelementptr inbounds i8, i8* %tmp, i64 4
				%tmp20 = load i8, i8* %tmp19, align 1
				%tmp21 = zext i8 %tmp20 to i64
				%tmp22 = shl nuw nsw i64 %tmp21, 24
				%tmp23 = or i64 %tmp18, %tmp22
				%tmp24 = getelementptr inbounds i8, i8* %tmp, i64 5
				%tmp25 = load i8, i8* %tmp24, align 1
				%tmp26 = zext i8 %tmp25 to i64
				%tmp27 = shl nuw nsw i64 %tmp26, 16
				%tmp28 = or i64 %tmp23, %tmp27
				%tmp29 = getelementptr inbounds i8, i8* %tmp, i64 6
				%tmp30 = load i8, i8* %tmp29, align 1
				%tmp31 = zext i8 %tmp30 to i64
				%tmp32 = shl nuw nsw i64 %tmp31, 8
				%tmp33 = or i64 %tmp28, %tmp32
				%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7
				%tmp35 = load i8, i8* %tmp34, align 1
				%tmp36 = zext i8 %tmp35 to i64
				%tmp37 = or i64 %tmp33, %tmp36
				ret i64 %tmp37
				}

test/CodeGen/ARM/load-combine.ll

This file was added.

				; RUN: llc < %s -mtriple=arm-unknown | FileCheck %s
				; RUN: llc < %s -mtriple=armv6-unknown | FileCheck %s --check-prefix=CHECK-ARMv6
				RKSimon (inline review comment, marked done): Additionally test with an armv6/7 CPU with REV? Same for the big-endian tests.

				; i8* p; // p is 1 byte aligned
				; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
				; Negative case: every load is only 1 byte aligned, and both run lines
				; expect the four ldrb loads and the orr combining to remain.
				define i32 @load_i32_by_i8_unaligned(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8_unaligned:
				; CHECK: ldrb{{.*}}r0
				; CHECK: ldrb{{.*}}r0
				; CHECK: ldrb{{.*}}r0
				; CHECK: ldrb{{.*}}r0
				; CHECK: orr
				; CHECK: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i32_by_i8_unaligned:
				; CHECK-ARMv6: ldrb{{.*}}r0
				; CHECK-ARMv6: ldrb{{.*}}r0
				; CHECK-ARMv6: ldrb{{.*}}r0
				; CHECK-ARMv6: ldrb{{.*}}r0
				; CHECK-ARMv6: orr
				; CHECK-ARMv6: bx lr
				%tmp = bitcast i32* %arg to i8*
				; NOTE: %tmp1 is never used; the first load below reads %tmp directly.
				%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
				%tmp2 = load i8, i8* %tmp, align 1
				%tmp3 = zext i8 %tmp2 to i32
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 8
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 16
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = shl nuw nsw i32 %tmp16, 24
				%tmp18 = or i32 %tmp13, %tmp17
				ret i32 %tmp18
				}

				; i8* p; // p is 4 byte aligned
				; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
				; Same pattern as the unaligned variant, but the first load is 4 byte
				; aligned; both run lines expect a single ldr.
				define i32 @load_i32_by_i8_aligned(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8_aligned:
				; CHECK: ldr r0, [r0]
				; CHECK-NEXT: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i32_by_i8_aligned:
				; CHECK-ARMv6: ldr r0, [r0]
				; CHECK-ARMv6-NEXT: bx lr
				%tmp = bitcast i32* %arg to i8*
				; NOTE: %tmp1 is never used; the first load below reads %tmp directly.
				%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
				%tmp2 = load i8, i8* %tmp, align 4
				%tmp3 = zext i8 %tmp2 to i32
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 8
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 16
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = shl nuw nsw i32 %tmp16, 24
				%tmp18 = or i32 %tmp13, %tmp17
				ret i32 %tmp18
				}

				; i8* p; // p is 4 byte aligned
				; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
				; p[0] lands in the most significant byte, so the combined load needs a
				; byte swap on these run lines.
				define i32 @load_i32_by_i8_bswap(i32* %arg) {
				; The default run line expects the byte swap expanded into an and/orr
				; sequence after the ldr; the ARMv6 run line expects a single rev.
				; CHECK-LABEL: load_i32_by_i8_bswap:
				; CHECK: ldr r0, [r0]
				; CHECK: and
				; CHECK-NEXT: and
				; CHECK-NEXT: orr
				; CHECK-NEXT: orr
				; CHECK-NEXT: orr
				; CHECK: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
				; CHECK-ARMv6: ldr r0, [r0]
				; CHECK-ARMv6-NEXT: rev r0, r0
				; CHECK-ARMv6-NEXT: bx lr
				%tmp = bitcast i32* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 4
				%tmp2 = zext i8 %tmp1 to i32
				%tmp3 = shl nuw nsw i32 %tmp2, 24
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 16
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 8
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = or i32 %tmp13, %tmp16
				ret i32 %tmp17
				}

				; i8* p; // p is 8 byte aligned
				; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56)
				; p[0] lands in the least significant byte. The default run line expects
				; two plain word loads; the ARMv6 run line expects a single ldrd. Neither
				; expects any byte swapping.
				define i64 @load_i64_by_i8(i64* %arg) {
				; CHECK-LABEL: load_i64_by_i8:
				; CHECK: ldr r2, [r0]
				; CHECK-NEXT: ldr r1, [r0, #4]
				; CHECK-NEXT: mov r0, r2
				; CHECK-NEXT: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i64_by_i8:
				; CHECK-ARMv6: ldrd r0, r1, [r0]
				; CHECK-ARMv6: bx lr
				%tmp = bitcast i64* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 8
				%tmp2 = zext i8 %tmp1 to i64
				%tmp3 = getelementptr inbounds i8, i8* %tmp, i64 1
				%tmp4 = load i8, i8* %tmp3, align 1
				%tmp5 = zext i8 %tmp4 to i64
				%tmp6 = shl nuw nsw i64 %tmp5, 8
				%tmp7 = or i64 %tmp6, %tmp2
				%tmp8 = getelementptr inbounds i8, i8* %tmp, i64 2
				%tmp9 = load i8, i8* %tmp8, align 1
				%tmp10 = zext i8 %tmp9 to i64
				%tmp11 = shl nuw nsw i64 %tmp10, 16
				%tmp12 = or i64 %tmp7, %tmp11
				%tmp13 = getelementptr inbounds i8, i8* %tmp, i64 3
				%tmp14 = load i8, i8* %tmp13, align 1
				%tmp15 = zext i8 %tmp14 to i64
				%tmp16 = shl nuw nsw i64 %tmp15, 24
				%tmp17 = or i64 %tmp12, %tmp16
				%tmp18 = getelementptr inbounds i8, i8* %tmp, i64 4
				%tmp19 = load i8, i8* %tmp18, align 1
				%tmp20 = zext i8 %tmp19 to i64
				%tmp21 = shl nuw nsw i64 %tmp20, 32
				%tmp22 = or i64 %tmp17, %tmp21
				%tmp23 = getelementptr inbounds i8, i8* %tmp, i64 5
				%tmp24 = load i8, i8* %tmp23, align 1
				%tmp25 = zext i8 %tmp24 to i64
				%tmp26 = shl nuw nsw i64 %tmp25, 40
				%tmp27 = or i64 %tmp22, %tmp26
				%tmp28 = getelementptr inbounds i8, i8* %tmp, i64 6
				%tmp29 = load i8, i8* %tmp28, align 1
				%tmp30 = zext i8 %tmp29 to i64
				%tmp31 = shl nuw nsw i64 %tmp30, 48
				%tmp32 = or i64 %tmp27, %tmp31
				%tmp33 = getelementptr inbounds i8, i8* %tmp, i64 7
				%tmp34 = load i8, i8* %tmp33, align 1
				%tmp35 = zext i8 %tmp34 to i64
				%tmp36 = shl nuw i64 %tmp35, 56
				%tmp37 = or i64 %tmp32, %tmp36
				ret i64 %tmp37
				}

				; i8* p; // p is 8 byte aligned
				; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7]
				; p[0] lands in the most significant byte. The default run line expects
				; two word loads plus an and/orr expansion of the swap; the ARMv6 run line
				; expects ldrd followed by one rev per 32-bit half.
				define i64 @load_i64_by_i8_bswap(i64* %arg) {
				; CHECK-LABEL: load_i64_by_i8_bswap:
				; CHECK: ldr{{.*}}r0
				; CHECK: ldr{{.*}}r0
				; CHECK: and
				; CHECK-NEXT: and
				; CHECK-NEXT: orr
				; CHECK-NEXT: orr
				; CHECK-NEXT: and
				; CHECK-NEXT: orr
				; CHECK-NEXT: and
				; CHECK-NEXT: orr
				; CHECK-NEXT: orr
				; CHECK-NEXT: orr
				; CHECK: mov pc, lr

				; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
				; CHECK-ARMv6: ldrd r2, r3, [r0]
				; CHECK-ARMv6: rev r0, r3
				; CHECK-ARMv6: rev r1, r2
				; CHECK-ARMv6: bx lr
				%tmp = bitcast i64* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 8
				%tmp2 = zext i8 %tmp1 to i64
				%tmp3 = shl nuw i64 %tmp2, 56
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i64 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i64
				%tmp7 = shl nuw nsw i64 %tmp6, 48
				%tmp8 = or i64 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i64 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i64
				%tmp12 = shl nuw nsw i64 %tmp11, 40
				%tmp13 = or i64 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i64 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i64
				%tmp17 = shl nuw nsw i64 %tmp16, 32
				%tmp18 = or i64 %tmp13, %tmp17
				%tmp19 = getelementptr inbounds i8, i8* %tmp, i64 4
				%tmp20 = load i8, i8* %tmp19, align 1
				%tmp21 = zext i8 %tmp20 to i64
				%tmp22 = shl nuw nsw i64 %tmp21, 24
				%tmp23 = or i64 %tmp18, %tmp22
				%tmp24 = getelementptr inbounds i8, i8* %tmp, i64 5
				%tmp25 = load i8, i8* %tmp24, align 1
				%tmp26 = zext i8 %tmp25 to i64
				%tmp27 = shl nuw nsw i64 %tmp26, 16
				%tmp28 = or i64 %tmp23, %tmp27
				%tmp29 = getelementptr inbounds i8, i8* %tmp, i64 6
				%tmp30 = load i8, i8* %tmp29, align 1
				%tmp31 = zext i8 %tmp30 to i64
				%tmp32 = shl nuw nsw i64 %tmp31, 8
				%tmp33 = or i64 %tmp28, %tmp32
				%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7
				%tmp35 = load i8, i8* %tmp34, align 1
				%tmp36 = zext i8 %tmp35 to i64
				%tmp37 = or i64 %tmp33, %tmp36
				ret i64 %tmp37
				}

test/CodeGen/X86/load-combine.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s
				; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK64

				; i8* p;
				; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
				; p[0] lands in the least significant byte and every load is align 1;
				; both run lines expect a single 32-bit movl load.
				define i32 @load_i32_by_i8(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movl (%eax), %eax
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i8:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movl (%rdi), %eax
				; CHECK64-NEXT: retq

				%tmp = bitcast i32* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 1
				%tmp2 = zext i8 %tmp1 to i32
				%tmp3 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp4 = load i8, i8* %tmp3, align 1
				%tmp5 = zext i8 %tmp4 to i32
				%tmp6 = shl nuw nsw i32 %tmp5, 8
				%tmp7 = or i32 %tmp6, %tmp2
				%tmp8 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp9 = load i8, i8* %tmp8, align 1
				%tmp10 = zext i8 %tmp9 to i32
				%tmp11 = shl nuw nsw i32 %tmp10, 16
				%tmp12 = or i32 %tmp7, %tmp11
				%tmp13 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp14 = load i8, i8* %tmp13, align 1
				%tmp15 = zext i8 %tmp14 to i32
				%tmp16 = shl nuw nsw i32 %tmp15, 24
				%tmp17 = or i32 %tmp12, %tmp16
				ret i32 %tmp17
				}

				; i8* p;
				; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
				; p[0] lands in the most significant byte; both run lines expect a single
				; 32-bit load followed by bswapl.
				define i32 @load_i32_by_i8_bswap(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8_bswap:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movl (%eax), %eax
				; CHECK-NEXT: bswapl %eax
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i8_bswap:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movl (%rdi), %eax
				; CHECK64-NEXT: bswapl %eax
				; CHECK64-NEXT: retq

				%tmp = bitcast i32* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 1
				%tmp2 = zext i8 %tmp1 to i32
				%tmp3 = shl nuw nsw i32 %tmp2, 24
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 16
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 8
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = or i32 %tmp13, %tmp16
				ret i32 %tmp17
				}

				; i16* p;
				; (i32) p[0] | ((i32) p[1] << 16)
				; Two adjacent i16 loads, with p[0] in the lower half; both run lines
				; expect them to fold into a single 32-bit movl load.
				define i32 @load_i32_by_i16(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i16:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movl (%eax), %eax
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i16:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movl (%rdi), %eax
				; CHECK64-NEXT: retq

				%tmp = bitcast i32* %arg to i16*
				%tmp1 = load i16, i16* %tmp, align 1
				%tmp2 = zext i16 %tmp1 to i32
				%tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
				%tmp4 = load i16, i16* %tmp3, align 1
				%tmp5 = zext i16 %tmp4 to i32
				%tmp6 = shl nuw nsw i32 %tmp5, 16
				%tmp7 = or i32 %tmp6, %tmp2
				ret i32 %tmp7
				}

				; i16* p_16;
				; i8* p_8 = (i8*) p_16;
				; (i32) p_16[0] | ((i32) p_8[2] << 16) | ((i32) p_8[3] << 24)
				; Mixed-width pattern: one i16 load for the lower half plus two byte
				; loads for the upper half; both run lines expect a single movl load.
				define i32 @load_i32_by_i16_i8(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i16_i8:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movl (%eax), %eax
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i16_i8:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movl (%rdi), %eax
				; CHECK64-NEXT: retq

				%tmp = bitcast i32* %arg to i16*
				%tmp1 = bitcast i32* %arg to i8*
				%tmp2 = load i16, i16* %tmp, align 1
				%tmp3 = zext i16 %tmp2 to i32
				%tmp4 = getelementptr inbounds i8, i8* %tmp1, i32 2
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 16
				%tmp8 = getelementptr inbounds i8, i8* %tmp1, i32 3
				%tmp9 = load i8, i8* %tmp8, align 1
				%tmp10 = zext i8 %tmp9 to i32
				%tmp11 = shl nuw nsw i32 %tmp10, 24
				%tmp12 = or i32 %tmp7, %tmp11
				%tmp13 = or i32 %tmp12, %tmp3
				ret i32 %tmp13
				}


				; i8* p;
				; (i32) ((i16) p[0] | ((i16) p[1] << 8)) | ((i32) ((i16) p[2] | ((i16) p[3] << 8)) << 16)
				; Two i16 halves are each built from two byte loads (offsets 0/1 and 2/3)
				; and then joined; both run lines expect a single 32-bit movl load.
				define i32 @load_i32_by_i16_by_i8(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i16_by_i8:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movl (%eax), %eax
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i16_by_i8:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movl (%rdi), %eax
				; CHECK64-NEXT: retq

				%tmp = bitcast i32* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 1
				%tmp2 = zext i8 %tmp1 to i16
				%tmp3 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp4 = load i8, i8* %tmp3, align 1
				%tmp5 = zext i8 %tmp4 to i16
				%tmp6 = shl nuw nsw i16 %tmp5, 8
				%tmp7 = or i16 %tmp6, %tmp2
				%tmp8 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp9 = load i8, i8* %tmp8, align 1
				%tmp10 = zext i8 %tmp9 to i16
				%tmp11 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp12 = load i8, i8* %tmp11, align 1
				%tmp13 = zext i8 %tmp12 to i16
				%tmp14 = shl nuw nsw i16 %tmp13, 8
				%tmp15 = or i16 %tmp14, %tmp10
				%tmp16 = zext i16 %tmp7 to i32
				%tmp17 = zext i16 %tmp15 to i32
				%tmp18 = shl nuw nsw i32 %tmp17, 16
				%tmp19 = or i32 %tmp18, %tmp16
				ret i32 %tmp19
				}

				; i8* p;
				; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[2] << 8) | (i16) p[3])
				; Two i16 halves are each built from two byte loads (offsets 0/1 and 2/3)
				; with p[0] ending up most significant; both run lines expect a single
				; 32-bit load followed by bswapl.
				define i32 @load_i32_by_i16_by_i8_bswap(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i16_by_i8_bswap:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movl (%eax), %eax
				; CHECK-NEXT: bswapl %eax
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i16_by_i8_bswap:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movl (%rdi), %eax
				; CHECK64-NEXT: bswapl %eax
				; CHECK64-NEXT: retq

				%tmp = bitcast i32* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 1
				%tmp2 = zext i8 %tmp1 to i16
				%tmp3 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp4 = load i8, i8* %tmp3, align 1
				%tmp5 = zext i8 %tmp4 to i16
				%tmp6 = shl nuw nsw i16 %tmp2, 8
				%tmp7 = or i16 %tmp6, %tmp5
				%tmp8 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp9 = load i8, i8* %tmp8, align 1
				%tmp10 = zext i8 %tmp9 to i16
				%tmp11 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp12 = load i8, i8* %tmp11, align 1
				%tmp13 = zext i8 %tmp12 to i16
				%tmp14 = shl nuw nsw i16 %tmp10, 8
				%tmp15 = or i16 %tmp14, %tmp13
				%tmp16 = zext i16 %tmp7 to i32
				%tmp17 = zext i16 %tmp15 to i32
				%tmp18 = shl nuw nsw i32 %tmp16, 16
				%tmp19 = or i32 %tmp18, %tmp17
				ret i32 %tmp19
				}

				; i8* p;
				; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56)
				; p[0] lands in the least significant byte. The i686 run line expects two
				; 32-bit movl loads (offsets 0 and 4); the x86_64 run line expects a single
				; movq. Neither expects a byte swap.
				define i64 @load_i64_by_i8(i64* %arg) {
				; CHECK-LABEL: load_i64_by_i8:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; CHECK-NEXT: movl (%ecx), %eax
				; CHECK-NEXT: movl 4(%ecx), %edx
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i64_by_i8:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movq (%rdi), %rax
				; CHECK64-NEXT: retq

				%tmp = bitcast i64* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 1
				%tmp2 = zext i8 %tmp1 to i64
				%tmp3 = getelementptr inbounds i8, i8* %tmp, i64 1
				%tmp4 = load i8, i8* %tmp3, align 1
				%tmp5 = zext i8 %tmp4 to i64
				%tmp6 = shl nuw nsw i64 %tmp5, 8
				%tmp7 = or i64 %tmp6, %tmp2
				%tmp8 = getelementptr inbounds i8, i8* %tmp, i64 2
				%tmp9 = load i8, i8* %tmp8, align 1
				%tmp10 = zext i8 %tmp9 to i64
				%tmp11 = shl nuw nsw i64 %tmp10, 16
				%tmp12 = or i64 %tmp7, %tmp11
				%tmp13 = getelementptr inbounds i8, i8* %tmp, i64 3
				%tmp14 = load i8, i8* %tmp13, align 1
				%tmp15 = zext i8 %tmp14 to i64
				%tmp16 = shl nuw nsw i64 %tmp15, 24
				%tmp17 = or i64 %tmp12, %tmp16
				%tmp18 = getelementptr inbounds i8, i8* %tmp, i64 4
				%tmp19 = load i8, i8* %tmp18, align 1
				%tmp20 = zext i8 %tmp19 to i64
				%tmp21 = shl nuw nsw i64 %tmp20, 32
				%tmp22 = or i64 %tmp17, %tmp21
				%tmp23 = getelementptr inbounds i8, i8* %tmp, i64 5
				%tmp24 = load i8, i8* %tmp23, align 1
				%tmp25 = zext i8 %tmp24 to i64
				%tmp26 = shl nuw nsw i64 %tmp25, 40
				%tmp27 = or i64 %tmp22, %tmp26
				%tmp28 = getelementptr inbounds i8, i8* %tmp, i64 6
				%tmp29 = load i8, i8* %tmp28, align 1
				%tmp30 = zext i8 %tmp29 to i64
				%tmp31 = shl nuw nsw i64 %tmp30, 48
				%tmp32 = or i64 %tmp27, %tmp31
				%tmp33 = getelementptr inbounds i8, i8* %tmp, i64 7
				%tmp34 = load i8, i8* %tmp33, align 1
				%tmp35 = zext i8 %tmp34 to i64
				%tmp36 = shl nuw i64 %tmp35, 56
				%tmp37 = or i64 %tmp32, %tmp36
				ret i64 %tmp37
				}

				; i8* p;
				; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7]
				; p[0] lands in the most significant byte. The i686 run line expects two
				; 32-bit loads, each followed by bswapl; the x86_64 run line expects one
				; movq followed by bswapq.
				define i64 @load_i64_by_i8_bswap(i64* %arg) {
				; CHECK-LABEL: load_i64_by_i8_bswap:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movl (%eax), %edx
				; CHECK-NEXT: movl 4(%eax), %eax
				; CHECK-NEXT: bswapl %eax
				; CHECK-NEXT: bswapl %edx
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i64_by_i8_bswap:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movq (%rdi), %rax
				; CHECK64-NEXT: bswapq %rax
				; CHECK64-NEXT: retq

				%tmp = bitcast i64* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 1
				%tmp2 = zext i8 %tmp1 to i64
				%tmp3 = shl nuw i64 %tmp2, 56
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i64 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i64
				%tmp7 = shl nuw nsw i64 %tmp6, 48
				%tmp8 = or i64 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i64 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i64
				%tmp12 = shl nuw nsw i64 %tmp11, 40
				%tmp13 = or i64 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i64 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i64
				%tmp17 = shl nuw nsw i64 %tmp16, 32
				%tmp18 = or i64 %tmp13, %tmp17
				%tmp19 = getelementptr inbounds i8, i8* %tmp, i64 4
				%tmp20 = load i8, i8* %tmp19, align 1
				%tmp21 = zext i8 %tmp20 to i64
				%tmp22 = shl nuw nsw i64 %tmp21, 24
				%tmp23 = or i64 %tmp18, %tmp22
				%tmp24 = getelementptr inbounds i8, i8* %tmp, i64 5
				%tmp25 = load i8, i8* %tmp24, align 1
				%tmp26 = zext i8 %tmp25 to i64
				%tmp27 = shl nuw nsw i64 %tmp26, 16
				%tmp28 = or i64 %tmp23, %tmp27
				%tmp29 = getelementptr inbounds i8, i8* %tmp, i64 6
				%tmp30 = load i8, i8* %tmp29, align 1
				%tmp31 = zext i8 %tmp30 to i64
				%tmp32 = shl nuw nsw i64 %tmp31, 8
				%tmp33 = or i64 %tmp28, %tmp32
				%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7
				%tmp35 = load i8, i8* %tmp34, align 1
				%tmp36 = zext i8 %tmp35 to i64
				%tmp37 = or i64 %tmp33, %tmp36
				ret i64 %tmp37
				}

				; Part of the load by bytes pattern is used outside of the pattern
				; i8* p;
				; i32 x = (i32) p[1]
				; res = ((i32) p[0] << 24) | (x << 16) | ((i32) p[2] << 8) | (i32) p[3]
				; x | res
				; Negative test: the expected output on both targets still contains the four
				; separate byte loads (movzbl) with shifts and ORs, i.e. no combined load.
				define i32 @load_i32_by_i8_bswap_uses(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8_bswap_uses:
				; CHECK: # BB#0:
				; CHECK-NEXT: pushl %esi
				; CHECK-NEXT: .Lcfi0:
				; CHECK-NEXT: .cfi_def_cfa_offset 8
				; CHECK-NEXT: .Lcfi1:
				; CHECK-NEXT: .cfi_offset %esi, -8
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movzbl (%eax), %ecx
				; CHECK-NEXT: shll $24, %ecx
				; CHECK-NEXT: movzbl 1(%eax), %edx
				; CHECK-NEXT: movl %edx, %esi
				; CHECK-NEXT: shll $16, %esi
				; CHECK-NEXT: orl %ecx, %esi
				; CHECK-NEXT: movzbl 2(%eax), %ecx
				; CHECK-NEXT: shll $8, %ecx
				; CHECK-NEXT: orl %esi, %ecx
				; CHECK-NEXT: movzbl 3(%eax), %eax
				; CHECK-NEXT: orl %ecx, %eax
				; CHECK-NEXT: orl %edx, %eax
				; CHECK-NEXT: popl %esi
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i8_bswap_uses:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movzbl (%rdi), %eax
				; CHECK64-NEXT: shll $24, %eax
				; CHECK64-NEXT: movzbl 1(%rdi), %ecx
				; CHECK64-NEXT: movl %ecx, %edx
				; CHECK64-NEXT: shll $16, %edx
				; CHECK64-NEXT: orl %eax, %edx
				; CHECK64-NEXT: movzbl 2(%rdi), %esi
				; CHECK64-NEXT: shll $8, %esi
				; CHECK64-NEXT: orl %edx, %esi
				; CHECK64-NEXT: movzbl 3(%rdi), %eax
				; CHECK64-NEXT: orl %esi, %eax
				; CHECK64-NEXT: orl %ecx, %eax
				; CHECK64-NEXT: retq

				%tmp = bitcast i32* %arg to i8*
				%tmp1 = load i8, i8* %tmp, align 1
				%tmp2 = zext i8 %tmp1 to i32
				%tmp3 = shl nuw nsw i32 %tmp2, 24
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				; %tmp6 (the zero-extended p[1]) feeds both the shift below and the final OR.
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 16
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 8
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = or i32 %tmp13, %tmp16
				; Use individual part of the pattern outside of the pattern
				%tmp18 = or i32 %tmp6, %tmp17
				ret i32 %tmp18
				}

				; One of the loads is volatile
				; i8* p;
				; p0 = volatile *p;
				; ((i32) p0 << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
				; Negative test: the expected output on both targets keeps four separate byte
				; loads, so the pattern is not replaced by one wide load.
				define i32 @load_i32_by_i8_bswap_volatile(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8_bswap_volatile:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movzbl (%eax), %ecx
				; CHECK-NEXT: shll $24, %ecx
				; CHECK-NEXT: movzbl 1(%eax), %edx
				; CHECK-NEXT: shll $16, %edx
				; CHECK-NEXT: orl %ecx, %edx
				; CHECK-NEXT: movzbl 2(%eax), %ecx
				; CHECK-NEXT: shll $8, %ecx
				; CHECK-NEXT: orl %edx, %ecx
				; CHECK-NEXT: movzbl 3(%eax), %eax
				; CHECK-NEXT: orl %ecx, %eax
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i8_bswap_volatile:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movzbl (%rdi), %eax
				; CHECK64-NEXT: shll $24, %eax
				; CHECK64-NEXT: movzbl 1(%rdi), %ecx
				; CHECK64-NEXT: shll $16, %ecx
				; CHECK64-NEXT: orl %eax, %ecx
				; CHECK64-NEXT: movzbl 2(%rdi), %edx
				; CHECK64-NEXT: shll $8, %edx
				; CHECK64-NEXT: orl %ecx, %edx
				; CHECK64-NEXT: movzbl 3(%rdi), %eax
				; CHECK64-NEXT: orl %edx, %eax
				; CHECK64-NEXT: retq

				%tmp = bitcast i32* %arg to i8*
				; The load of p[0] is the only volatile access in the pattern.
				%tmp1 = load volatile i8, i8* %tmp, align 1
				%tmp2 = zext i8 %tmp1 to i32
				%tmp3 = shl nuw nsw i32 %tmp2, 24
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 16
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 8
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = or i32 %tmp13, %tmp16
				ret i32 %tmp17
				}

				; There is a store in between individual loads
				; i8* p, q;
				; res1 = ((i32) p[0] << 24) | ((i32) p[1] << 16)
				; *q = 0;
				; res2 = ((i32) p[2] << 8) | (i32) p[3]
				; res1 | res2
				; Negative test: the expected output on both targets keeps four separate byte
				; loads, with the store of 0 emitted after the load of p[1].
				define i32 @load_i32_by_i8_bswap_store_in_between(i32* %arg, i32* %arg1) {
				; CHECK-LABEL: load_i32_by_i8_bswap_store_in_between:
				; CHECK: # BB#0:
				; CHECK-NEXT: pushl %esi
				; CHECK-NEXT: .Lcfi2:
				; CHECK-NEXT: .cfi_def_cfa_offset 8
				; CHECK-NEXT: .Lcfi3:
				; CHECK-NEXT: .cfi_offset %esi, -8
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; CHECK-NEXT: movzbl (%ecx), %edx
				; CHECK-NEXT: shll $24, %edx
				; CHECK-NEXT: movzbl 1(%ecx), %esi
				; CHECK-NEXT: movl $0, (%eax)
				; CHECK-NEXT: shll $16, %esi
				; CHECK-NEXT: orl %edx, %esi
				; CHECK-NEXT: movzbl 2(%ecx), %edx
				; CHECK-NEXT: shll $8, %edx
				; CHECK-NEXT: orl %esi, %edx
				; CHECK-NEXT: movzbl 3(%ecx), %eax
				; CHECK-NEXT: orl %edx, %eax
				; CHECK-NEXT: popl %esi
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i8_bswap_store_in_between:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movzbl (%rdi), %eax
				; CHECK64-NEXT: shll $24, %eax
				; CHECK64-NEXT: movzbl 1(%rdi), %ecx
				; CHECK64-NEXT: movl $0, (%rsi)
				; CHECK64-NEXT: shll $16, %ecx
				; CHECK64-NEXT: orl %eax, %ecx
				; CHECK64-NEXT: movzbl 2(%rdi), %edx
				; CHECK64-NEXT: shll $8, %edx
				; CHECK64-NEXT: orl %ecx, %edx
				; CHECK64-NEXT: movzbl 3(%rdi), %eax
				; CHECK64-NEXT: orl %edx, %eax
				; CHECK64-NEXT: retq

				%tmp = bitcast i32* %arg to i8*
				%tmp2 = load i8, i8* %tmp, align 1
				%tmp3 = zext i8 %tmp2 to i32
				%tmp4 = shl nuw nsw i32 %tmp3, 24
				%tmp5 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp6 = load i8, i8* %tmp5, align 1
				; This store will prevent folding of the pattern
				; (it writes through %arg1, a pointer separate from the one being loaded).
				store i32 0, i32* %arg1
				%tmp7 = zext i8 %tmp6 to i32
				%tmp8 = shl nuw nsw i32 %tmp7, 16
				%tmp9 = or i32 %tmp8, %tmp4
				%tmp10 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp11 = load i8, i8* %tmp10, align 1
				%tmp12 = zext i8 %tmp11 to i32
				%tmp13 = shl nuw nsw i32 %tmp12, 8
				%tmp14 = or i32 %tmp9, %tmp13
				%tmp15 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp16 = load i8, i8* %tmp15, align 1
				%tmp17 = zext i8 %tmp16 to i32
				%tmp18 = or i32 %tmp14, %tmp17
				ret i32 %tmp18
				}

				; One of the loads is from an unrelated location
				; i8* p, q;
				; ((i32) p[0] << 24) | ((i32) q[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
				; Negative test: the expected output on both targets keeps four separate byte
				; loads, three based on the first argument and one on the second.
				define i32 @load_i32_by_i8_bswap_unrelated_load(i32* %arg, i32* %arg1) {
				; CHECK-LABEL: load_i32_by_i8_bswap_unrelated_load:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; CHECK-NEXT: movzbl (%ecx), %edx
				; CHECK-NEXT: shll $24, %edx
				; CHECK-NEXT: movzbl 1(%eax), %eax
				; CHECK-NEXT: shll $16, %eax
				; CHECK-NEXT: orl %edx, %eax
				; CHECK-NEXT: movzbl 2(%ecx), %edx
				; CHECK-NEXT: shll $8, %edx
				; CHECK-NEXT: orl %eax, %edx
				; CHECK-NEXT: movzbl 3(%ecx), %eax
				; CHECK-NEXT: orl %edx, %eax
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i8_bswap_unrelated_load:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movzbl (%rdi), %eax
				; CHECK64-NEXT: shll $24, %eax
				; CHECK64-NEXT: movzbl 1(%rsi), %ecx
				; CHECK64-NEXT: shll $16, %ecx
				; CHECK64-NEXT: orl %eax, %ecx
				; CHECK64-NEXT: movzbl 2(%rdi), %edx
				; CHECK64-NEXT: shll $8, %edx
				; CHECK64-NEXT: orl %ecx, %edx
				; CHECK64-NEXT: movzbl 3(%rdi), %eax
				; CHECK64-NEXT: orl %edx, %eax
				; CHECK64-NEXT: retq

				; %tmp is the byte view of %arg (p); %tmp2 is the byte view of %arg1 (q).
				%tmp = bitcast i32* %arg to i8*
				%tmp2 = bitcast i32* %arg1 to i8*
				%tmp3 = load i8, i8* %tmp, align 1
				%tmp4 = zext i8 %tmp3 to i32
				%tmp5 = shl nuw nsw i32 %tmp4, 24
				; Load from an unrelated address
				%tmp6 = getelementptr inbounds i8, i8* %tmp2, i32 1
				%tmp7 = load i8, i8* %tmp6, align 1
				%tmp8 = zext i8 %tmp7 to i32
				%tmp9 = shl nuw nsw i32 %tmp8, 16
				%tmp10 = or i32 %tmp9, %tmp5
				%tmp11 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp12 = load i8, i8* %tmp11, align 1
				%tmp13 = zext i8 %tmp12 to i32
				%tmp14 = shl nuw nsw i32 %tmp13, 8
				%tmp15 = or i32 %tmp10, %tmp14
				%tmp16 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp17 = load i8, i8* %tmp16, align 1
				%tmp18 = zext i8 %tmp17 to i32
				%tmp19 = or i32 %tmp15, %tmp18
				ret i32 %tmp19
				}

				; Non-zero offsets are not supported for now
				; i8* p;
				; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
				; The four bytes are consecutive but start at offset 1 from the base pointer.
				; The expected output on both targets keeps four separate byte loads.
				define i32 @load_i32_by_i8_unsupported_offset(i32* %arg) {
				; CHECK-LABEL: load_i32_by_i8_unsupported_offset:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movzbl 1(%eax), %ecx
				; CHECK-NEXT: movzbl 2(%eax), %edx
				; CHECK-NEXT: shll $8, %edx
				; CHECK-NEXT: orl %ecx, %edx
				; CHECK-NEXT: movzbl 3(%eax), %ecx
				; CHECK-NEXT: shll $16, %ecx
				; CHECK-NEXT: orl %edx, %ecx
				; CHECK-NEXT: movzbl 4(%eax), %eax
				; CHECK-NEXT: shll $24, %eax
				; CHECK-NEXT: orl %ecx, %eax
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i8_unsupported_offset:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movzbl 1(%rdi), %eax
				; CHECK64-NEXT: movzbl 2(%rdi), %ecx
				; CHECK64-NEXT: shll $8, %ecx
				; CHECK64-NEXT: orl %eax, %ecx
				; CHECK64-NEXT: movzbl 3(%rdi), %edx
				; CHECK64-NEXT: shll $16, %edx
				; CHECK64-NEXT: orl %ecx, %edx
				; CHECK64-NEXT: movzbl 4(%rdi), %eax
				; CHECK64-NEXT: shll $24, %eax
				; CHECK64-NEXT: orl %edx, %eax
				; CHECK64-NEXT: retq

				%tmp = bitcast i32* %arg to i8*
				; The lowest byte of the result comes from p[1], not p[0].
				%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
				%tmp2 = load i8, i8* %tmp1, align 1
				%tmp3 = zext i8 %tmp2 to i32
				%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
				%tmp5 = load i8, i8* %tmp4, align 1
				%tmp6 = zext i8 %tmp5 to i32
				%tmp7 = shl nuw nsw i32 %tmp6, 8
				%tmp8 = or i32 %tmp7, %tmp3
				%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
				%tmp10 = load i8, i8* %tmp9, align 1
				%tmp11 = zext i8 %tmp10 to i32
				%tmp12 = shl nuw nsw i32 %tmp11, 16
				%tmp13 = or i32 %tmp8, %tmp12
				%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
				%tmp15 = load i8, i8* %tmp14, align 1
				%tmp16 = zext i8 %tmp15 to i32
				%tmp17 = shl nuw nsw i32 %tmp16, 24
				%tmp18 = or i32 %tmp13, %tmp17
				ret i32 %tmp18
				}

				; Byte-swapped i32 assembled from four byte loads addressed as base + variable
				; index + constant (p[i] .. p[i + 3]).
				; i8* p; i32 i;
				; ((i32) p[i] << 24) | ((i32) p[i + 1] << 16) | ((i32) p[i + 2] << 8) | (i32) p[i + 3]
				; Expected output: on 32-bit x86 a single indexed 32-bit load followed by
				; bswapl; on x86-64 four separate byte loads (see the TODO below).
				define i32 @load_i32_by_i8_bswap_base_index_offset(i32* %arg, i32 %arg1) {
				; CHECK-LABEL: load_i32_by_i8_bswap_base_index_offset:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; CHECK-NEXT: movl (%ecx,%eax), %eax
				; CHECK-NEXT: bswapl %eax
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: load_i32_by_i8_bswap_base_index_offset:
				; CHECK64: # BB#0:
				; CHECK64-NEXT: movslq %esi, %rax
				; CHECK64-NEXT: movzbl (%rdi,%rax), %ecx
				; CHECK64-NEXT: shll $24, %ecx
				; CHECK64-NEXT: movzbl 1(%rdi,%rax), %edx
				; CHECK64-NEXT: shll $16, %edx
				; CHECK64-NEXT: orl %ecx, %edx
				; CHECK64-NEXT: movzbl 2(%rdi,%rax), %ecx
				; CHECK64-NEXT: shll $8, %ecx
				; CHECK64-NEXT: orl %edx, %ecx
				; CHECK64-NEXT: movzbl 3(%rdi,%rax), %eax
				; CHECK64-NEXT: orl %ecx, %eax
				; CHECK64-NEXT: retq
				; TODO: Currently we don't fold the pattern for x86-64 target because we don't
				; see that the loads are adjacent. It happens because BaseIndexOffset doesn't
				; look through zexts.

				%tmp = bitcast i32* %arg to i8*
				; p[i]: most significant byte of the result.
				%tmp2 = getelementptr inbounds i8, i8* %tmp, i32 %arg1
				%tmp3 = load i8, i8* %tmp2, align 1
				%tmp4 = zext i8 %tmp3 to i32
				%tmp5 = shl nuw nsw i32 %tmp4, 24
				; p[i + 1]: the index is computed with an explicit i32 add.
				%tmp6 = add nuw nsw i32 %arg1, 1
				%tmp7 = getelementptr inbounds i8, i8* %tmp, i32 %tmp6
				%tmp8 = load i8, i8* %tmp7, align 1
				%tmp9 = zext i8 %tmp8 to i32
				%tmp10 = shl nuw nsw i32 %tmp9, 16
				%tmp11 = or i32 %tmp10, %tmp5
				%tmp12 = add nuw nsw i32 %arg1, 2
				%tmp13 = getelementptr inbounds i8, i8* %tmp, i32 %tmp12
				%tmp14 = load i8, i8* %tmp13, align 1
				%tmp15 = zext i8 %tmp14 to i32
				%tmp16 = shl nuw nsw i32 %tmp15, 8
				%tmp17 = or i32 %tmp11, %tmp16
				; p[i + 3]: least significant byte, used unshifted.
				%tmp18 = add nuw nsw i32 %arg1, 3
				%tmp19 = getelementptr inbounds i8, i8* %tmp, i32 %tmp18
				%tmp20 = load i8, i8* %tmp19, align 1
				%tmp21 = zext i8 %tmp20 to i32
				%tmp22 = or i32 %tmp17, %tmp21
				ret i32 %tmp22
				}

				; Verify that we don't crash handling shl i32 %conv57, 32
				; Both shl instructions below use a shift amount (56 and 32) that is not
				; smaller than the 32-bit width of the shifted type. The expected output on
				; both targets is a store of -1 to %dst with no loads or shifts.
				define void @shift_i32_by_32(i8* %src1, i8* %src2, i64* %dst) {
				; CHECK-LABEL: shift_i32_by_32:
				; CHECK: # BB#0: # %entry
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movl $-1, 4(%eax)
				; CHECK-NEXT: movl $-1, (%eax)
				; CHECK-NEXT: retl
				;
				; CHECK64-LABEL: shift_i32_by_32:
				; CHECK64: # BB#0: # %entry
				; CHECK64-NEXT: movq $-1, (%rdx)
				; CHECK64-NEXT: retq
				entry:
				%load1 = load i8, i8* %src1, align 1
				%conv46 = zext i8 %load1 to i32
				%shl47 = shl i32 %conv46, 56
				%or55 = or i32 %shl47, 0
				%load2 = load i8, i8* %src2, align 1
				%conv57 = zext i8 %load2 to i32
				%shl58 = shl i32 %conv57, 32
				%or59 = or i32 %or55, %shl58
				%or74 = or i32 %or59, 0
				; The i32 result is sign-extended to i64 before being stored.
				%conv75 = sext i32 %or74 to i64
				store i64 %conv75, i64* %dst, align 8
				ret void
				}

This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombiner] Match load by bytes idiom and fold it into a single load. Attempt #2.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 84547

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

test/CodeGen/AArch64/load-combine-big-endian.ll

test/CodeGen/AArch64/load-combine.ll

test/CodeGen/ARM/load-combine-big-endian.ll

test/CodeGen/ARM/load-combine.ll

test/CodeGen/X86/load-combine.ll

This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombiner] Match load by bytes idiom and fold it into a single load. Attempt #2.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 84547

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

test/CodeGen/AArch64/load-combine-big-endian.ll

test/CodeGen/AArch64/load-combine.ll

test/CodeGen/ARM/load-combine-big-endian.ll

test/CodeGen/ARM/load-combine.ll

test/CodeGen/X86/load-combine.ll

[DAGCombiner] Match load by bytes idiom and fold it into a single load. Attempt #2.
ClosedPublic