This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombiner] Support non-zero offset in load combine
ClosedPublic

Authored by apilipenko on Feb 1 2017, 6:36 AM.

Download Raw Diff

Details

Reviewers

RKSimon
filcab
reames
javed.absar
hfinkel

Commits

rG4a64031954b6: [DAGCombiner] Support non-zero offset in load combine
rL294582: [DAGCombiner] Support non-zero offset in load combine

Summary

Enable folding patterns which load the value from non-zero offset:

i8 *a = ...
i32 val = a[4] | (a[5] << 8) | (a[6] << 16) | (a[7] << 24)

=>

i32 val = *((i32*)(a+4))

Diff Detail

Event Timeline

apilipenko created this revision. Feb 1 2017, 6:36 AM

Herald added a reviewer: javed.absar. · View Herald Transcript · Feb 1 2017, 6:36 AM

apilipenko edited the summary of this revision. (Show Details) Feb 1 2017, 6:36 AM

boris.ulasevich added a subscriber: boris.ulasevich. Feb 2 2017, 12:05 AM

boris.ulasevich added inline comments.

test/CodeGen/AArch64/load-combine-big-endian.ll
203	Just a question.. This inefficient implementation should be optimised in spite of non-zero offset. How do we check the optimisation works Ok?

Please can you commit the new tests to trunk now with their current codegen and then update this patch to show the delta in codegen - I think this will answer @boris.ulasevich 's concerns.

The test cases have been landed separately (rL294185), rebase the patch.

apilipenko added a child revision: D29591: [DAGCombiner] Support {a|s}ext, {a|z|s}ext load nodes in load combine. Feb 7 2017, 7:16 AM

One minor

lib/CodeGen/SelectionDAG/DAGCombiner.cpp
4588	Add an assert that ensures FirstOffset has a sane value.

Add an assert as suggested.

LGTM

This revision is now accepted and ready to land. Feb 8 2017, 8:44 AM

Closed by commit rL294582: [DAGCombiner] Support non-zero offset in load combine (authored by apilipenko). · Explain Why · Feb 9 2017, 4:17 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

10 lines

test/

CodeGen/

AArch64/

load-combine-big-endian.ll

118 lines

load-combine.ll

118 lines

ARM/

load-combine-big-endian.ll

148 lines

load-combine.ll

148 lines

X86/

load-combine.ll

141 lines

Diff 86624

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,527 Lines • ▼ Show 20 Lines	SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
std::function<unsigned(unsigned, unsigned)> BigEndianByteAt = [](		std::function<unsigned(unsigned, unsigned)> BigEndianByteAt = [](
unsigned BW, unsigned i) { return BW - i - 1; };		unsigned BW, unsigned i) { return BW - i - 1; };

Optional<BaseIndexOffset> Base;		Optional<BaseIndexOffset> Base;
SDValue Chain;		SDValue Chain;

SmallSet<LoadSDNode *, 8> Loads;		SmallSet<LoadSDNode *, 8> Loads;
LoadSDNode *FirstLoad = nullptr;		LoadSDNode *FirstLoad = nullptr;
		int64_t FirstOffset = INT64_MAX;

bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();		bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
auto ByteAt = IsBigEndianTarget ? BigEndianByteAt : LittleEndianByteAt;		auto ByteAt = IsBigEndianTarget ? BigEndianByteAt : LittleEndianByteAt;

// Check if all the bytes of the OR we are looking at are loaded from the same		// Check if all the bytes of the OR we are looking at are loaded from the same
// base address. Collect bytes offsets from Base address in ByteOffsets.		// base address. Collect bytes offsets from Base address in ByteOffsets.
SmallVector<int64_t, 4> ByteOffsets(ByteWidth);		SmallVector<int64_t, 4> ByteOffsets(ByteWidth);
for (unsigned i = 0; i < ByteWidth; i++) {		for (unsigned i = 0; i < ByteWidth; i++) {
Show All 26 Lines	for (unsigned i = 0; i < ByteWidth; i++) {
assert(LoadBitWidth % 8 == 0 &&		assert(LoadBitWidth % 8 == 0 &&
"can only analyze providers for individual bytes not bit");		"can only analyze providers for individual bytes not bit");
unsigned LoadByteWidth = LoadBitWidth / 8;		unsigned LoadByteWidth = LoadBitWidth / 8;
int64_t MemoryByteOffset = ByteAt(LoadByteWidth, P->ByteOffset);		int64_t MemoryByteOffset = ByteAt(LoadByteWidth, P->ByteOffset);
int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset;		int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset;
ByteOffsets[i] = ByteOffsetFromBase;		ByteOffsets[i] = ByteOffsetFromBase;

// Remember the first byte load		// Remember the first byte load
if (ByteOffsetFromBase == 0)		if (ByteOffsetFromBase < FirstOffset) {
FirstLoad = L;		FirstLoad = L;
		FirstOffset = ByteOffsetFromBase;
		}

Loads.insert(L);		Loads.insert(L);
}		}
assert(Loads.size() > 0 && "All the bytes of the value must be loaded from "		assert(Loads.size() > 0 && "All the bytes of the value must be loaded from "
"memory, so there must be at least one load which produces the value");		"memory, so there must be at least one load which produces the value");
assert(Base && "Base address of the accessed memory location must be set");		assert(Base && "Base address of the accessed memory location must be set");
		RKSimon (inline comment; Unsubmitted, Not Done): Add an assert that ensures FirstOffset has a sane value.

// Check if the bytes of the OR we are looking at match with either big or		// Check if the bytes of the OR we are looking at match with either big or
// little endian value load		// little endian value load
bool BigEndian = true, LittleEndian = true;		bool BigEndian = true, LittleEndian = true;
for (unsigned i = 0; i < ByteWidth; i++) {		for (unsigned i = 0; i < ByteWidth; i++) {
LittleEndian &= ByteOffsets[i] == LittleEndianByteAt(ByteWidth, i);		int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
BigEndian &= ByteOffsets[i] == BigEndianByteAt(ByteWidth, i);		LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i);
		BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i);
if (!BigEndian && !LittleEndian)		if (!BigEndian && !LittleEndian)
return SDValue();		return SDValue();
}		}
assert((BigEndian != LittleEndian) && "should be either or");		assert((BigEndian != LittleEndian) && "should be either or");
assert(FirstLoad && "must be set");		assert(FirstLoad && "must be set");

// The node we are looking at matches with the pattern, check if we can		// The node we are looking at matches with the pattern, check if we can
// replace it with a single load and bswap if needed.		// replace it with a single load and bswap if needed.
▲ Show 20 Lines • Show All 9,991 Lines • Show Last 20 Lines

test/CodeGen/AArch64/load-combine-big-endian.ll

Show First 20 Lines • Show All 185 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%tmp32 = shl nuw nsw i64 %tmp31, 8		%tmp32 = shl nuw nsw i64 %tmp31, 8
%tmp33 = or i64 %tmp28, %tmp32		%tmp33 = or i64 %tmp28, %tmp32
%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7		%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7
%tmp35 = load i8, i8* %tmp34, align 1		%tmp35 = load i8, i8* %tmp34, align 1
%tmp36 = zext i8 %tmp35 to i64		%tmp36 = zext i8 %tmp35 to i64
%tmp37 = or i64 %tmp33, %tmp36		%tmp37 = or i64 %tmp33, %tmp36
ret i64 %tmp37		ret i64 %tmp37
}		}

		; i8* p; // p[1] is 4 byte aligned
		; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
		define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
		; CHECK: ldur w8, [x0, #1]
		; CHECK-NEXT: rev w0, w8
		; CHECK-NEXT: ret

		%tmp = bitcast i32* %arg to i8*
		boris.ulasevich (inline comment; Unsubmitted, Not Done): Just a question.. This inefficient implementation should be optimised in spite of non-zero offset. How do we check the optimisation works Ok?
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
		%tmp2 = load i8, i8* %tmp1, align 4
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
		%tmp15 = load i8, i8* %tmp14, align 1
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[-4] is 4 byte aligned
		; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
		define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_neg_offset:
		; CHECK: ldur w8, [x0, #-4]
		; CHECK-NEXT: rev w0, w8
		; CHECK-NEXT: ret

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
		%tmp2 = load i8, i8* %tmp1, align 4
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
		%tmp15 = load i8, i8* %tmp14, align 1
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[1] is 4 byte aligned
		; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
		define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
		; CHECK: ldur w0, [x0, #1]
		; CHECK-NEXT: ret

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
		%tmp2 = load i8, i8* %tmp1, align 1
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
		%tmp15 = load i8, i8* %tmp14, align 4
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[-4] is 4 byte aligned
		; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
		define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
		; CHECK: ldur w0, [x0, #-4]
		; CHECK-NEXT: ret

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
		%tmp2 = load i8, i8* %tmp1, align 1
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
		%tmp15 = load i8, i8* %tmp14, align 4
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

test/CodeGen/AArch64/load-combine.ll

Show First 20 Lines • Show All 172 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%tmp32 = shl nuw nsw i64 %tmp31, 8		%tmp32 = shl nuw nsw i64 %tmp31, 8
%tmp33 = or i64 %tmp28, %tmp32		%tmp33 = or i64 %tmp28, %tmp32
%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7		%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7
%tmp35 = load i8, i8* %tmp34, align 1		%tmp35 = load i8, i8* %tmp34, align 1
%tmp36 = zext i8 %tmp35 to i64		%tmp36 = zext i8 %tmp35 to i64
%tmp37 = or i64 %tmp33, %tmp36		%tmp37 = or i64 %tmp33, %tmp36
ret i64 %tmp37		ret i64 %tmp37
}		}

		; i8* p; // p[1] is 4 byte aligned
		; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
		define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
		; CHECK: ldur w0, [x0, #1]
		; CHECK-NEXT: ret

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
		%tmp2 = load i8, i8* %tmp1, align 4
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
		%tmp15 = load i8, i8* %tmp14, align 1
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[-4] is 4 byte aligned
		; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
		define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_neg_offset:
		; CHECK: ldur w0, [x0, #-4]
		; CHECK-NEXT: ret

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
		%tmp2 = load i8, i8* %tmp1, align 4
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
		%tmp15 = load i8, i8* %tmp14, align 1
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[1] is 4 byte aligned
		; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
		define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
		; CHECK: ldur w8, [x0, #1]
		; CHECK-NEXT: rev w0, w8
		; CHECK-NEXT: ret

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
		%tmp2 = load i8, i8* %tmp1, align 1
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
		%tmp15 = load i8, i8* %tmp14, align 4
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[-4] is 4 byte aligned
		; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
		define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
		; CHECK: ldur w8, [x0, #-4]
		; CHECK-NEXT: rev w0, w8
		; CHECK-NEXT: ret

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
		%tmp2 = load i8, i8* %tmp1, align 1
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
		%tmp15 = load i8, i8* %tmp14, align 4
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

test/CodeGen/ARM/load-combine-big-endian.ll

Show First 20 Lines • Show All 263 Lines • ▼ Show 20 Lines	; CHECK-ARMv6: bx lr
%tmp32 = shl nuw nsw i64 %tmp31, 8		%tmp32 = shl nuw nsw i64 %tmp31, 8
%tmp33 = or i64 %tmp28, %tmp32		%tmp33 = or i64 %tmp28, %tmp32
%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7		%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7
%tmp35 = load i8, i8* %tmp34, align 1		%tmp35 = load i8, i8* %tmp34, align 1
%tmp36 = zext i8 %tmp35 to i64		%tmp36 = zext i8 %tmp35 to i64
%tmp37 = or i64 %tmp33, %tmp36		%tmp37 = or i64 %tmp33, %tmp36
ret i64 %tmp37		ret i64 %tmp37
}		}

		; i8* p; // p[1] is 4 byte aligned
		; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
		define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
		; CHECK: ldr r0, [r0, #1]
		; CHECK-NEXT: mov r1, #65280
		; CHECK-NEXT: mov r2, #16711680
		; CHECK-NEXT: and r1, r1, r0, lsr #8
		; CHECK-NEXT: and r2, r2, r0, lsl #8
		; CHECK-NEXT: orr r1, r1, r0, lsr #24
		; CHECK-NEXT: orr r0, r2, r0, lsl #24
		; CHECK-NEXT: orr r0, r0, r1
		; CHECK-NEXT: mov pc, lr

		; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset:
		; CHECK-ARMv6: ldr r0, [r0, #1]
		; CHECK-ARMv6-NEXT: rev r0, r0
		; CHECK-ARMv6-NEXT: bx lr

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
		%tmp2 = load i8, i8* %tmp1, align 4
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
		%tmp15 = load i8, i8* %tmp14, align 1
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[-4] is 4 byte aligned
		; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
		define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_neg_offset:
		; CHECK: ldr r0, [r0, #-4]
		; CHECK-NEXT: mov r1, #65280
		; CHECK-NEXT: mov r2, #16711680
		; CHECK-NEXT: and r1, r1, r0, lsr #8
		; CHECK-NEXT: and r2, r2, r0, lsl #8
		; CHECK-NEXT: orr r1, r1, r0, lsr #24
		; CHECK-NEXT: orr r0, r2, r0, lsl #24
		; CHECK-NEXT: orr r0, r0, r1
		; CHECK-NEXT: mov pc, lr

		; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset:
		; CHECK-ARMv6: ldr r0, [r0, #-4]
		; CHECK-ARMv6-NEXT: rev r0, r0
		; CHECK-ARMv6-NEXT: bx lr

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
		%tmp2 = load i8, i8* %tmp1, align 4
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
		%tmp15 = load i8, i8* %tmp14, align 1
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[1] is 4 byte aligned
		; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
		define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
		; CHECK: ldr r0, [r0, #1]
		; CHECK-NEXT: mov pc, lr

		; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap:
		; CHECK-ARMv6: ldr r0, [r0, #1]
		; CHECK-ARMv6-NEXT: bx lr

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
		%tmp2 = load i8, i8* %tmp1, align 1
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
		%tmp15 = load i8, i8* %tmp14, align 4
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[-4] is 4 byte aligned
		; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
		define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
		; CHECK: ldr r0, [r0, #-4]
		; CHECK-NEXT: mov pc, lr

		; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap:
		; CHECK-ARMv6: ldr r0, [r0, #-4]
		; CHECK-ARMv6-NEXT: bx lr

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
		%tmp2 = load i8, i8* %tmp1, align 1
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
		%tmp15 = load i8, i8* %tmp14, align 4
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

test/CodeGen/ARM/load-combine.ll

Show First 20 Lines • Show All 221 Lines • ▼ Show 20 Lines	; CHECK-ARMv6: bx lr
%tmp32 = shl nuw nsw i64 %tmp31, 8		%tmp32 = shl nuw nsw i64 %tmp31, 8
%tmp33 = or i64 %tmp28, %tmp32		%tmp33 = or i64 %tmp28, %tmp32
%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7		%tmp34 = getelementptr inbounds i8, i8* %tmp, i64 7
%tmp35 = load i8, i8* %tmp34, align 1		%tmp35 = load i8, i8* %tmp34, align 1
%tmp36 = zext i8 %tmp35 to i64		%tmp36 = zext i8 %tmp35 to i64
%tmp37 = or i64 %tmp33, %tmp36		%tmp37 = or i64 %tmp33, %tmp36
ret i64 %tmp37		ret i64 %tmp37
}		}

		; i8* p; // p[1] is 4 byte aligned
		; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
		define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
		; CHECK: ldr r0, [r0, #1]
		; CHECK-NEXT: mov pc, lr

		; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset:
		; CHECK-ARMv6: ldr r0, [r0, #1]
		; CHECK-ARMv6-NEXT: bx lr

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
		%tmp2 = load i8, i8* %tmp1, align 4
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
		%tmp15 = load i8, i8* %tmp14, align 1
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[-4] is 4 byte aligned
		; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
		define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_neg_offset:
		; CHECK: ldr r0, [r0, #-4]
		; CHECK-NEXT: mov pc, lr

		; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset:
		; CHECK-ARMv6: ldr r0, [r0, #-4]
		; CHECK-ARMv6-NEXT: bx lr

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
		%tmp2 = load i8, i8* %tmp1, align 4
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
		%tmp15 = load i8, i8* %tmp14, align 1
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[1] is 4 byte aligned
		; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
		define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
		; CHECK: ldr r0, [r0, #1]
		; CHECK-NEXT: mov r1, #65280
		; CHECK-NEXT: mov r2, #16711680
		; CHECK-NEXT: and r1, r1, r0, lsr #8
		; CHECK-NEXT: and r2, r2, r0, lsl #8
		; CHECK-NEXT: orr r1, r1, r0, lsr #24
		; CHECK-NEXT: orr r0, r2, r0, lsl #24
		; CHECK-NEXT: orr r0, r0, r1
		; CHECK-NEXT: mov pc, lr

		; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap:
		; CHECK-ARMv6: ldr r0, [r0, #1]
		; CHECK-ARMv6-NEXT: rev r0, r0
		; CHECK-ARMv6-NEXT: bx lr

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
		%tmp2 = load i8, i8* %tmp1, align 1
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
		%tmp15 = load i8, i8* %tmp14, align 4
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p; // p[-4] is 4 byte aligned
		; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
		define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
		; CHECK: ldr r0, [r0, #-4]
		; CHECK-NEXT: mov r1, #65280
		; CHECK-NEXT: mov r2, #16711680
		; CHECK-NEXT: and r1, r1, r0, lsr #8
		; CHECK-NEXT: and r2, r2, r0, lsl #8
		; CHECK-NEXT: orr r1, r1, r0, lsr #24
		; CHECK-NEXT: orr r0, r2, r0, lsl #24
		; CHECK-NEXT: orr r0, r0, r1
		; CHECK-NEXT: mov pc, lr

		; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap:
		; CHECK-ARMv6: ldr r0, [r0, #-4]
		; CHECK-ARMv6-NEXT: rev r0, r0
		; CHECK-ARMv6-NEXT: bx lr

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
		%tmp2 = load i8, i8* %tmp1, align 1
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
		%tmp15 = load i8, i8* %tmp14, align 4
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

test/CodeGen/X86/load-combine.ll

Show First 20 Lines • Show All 565 Lines • ▼ Show 20 Lines	; CHECK64-NEXT: retq
%tmp15 = or i32 %tmp10, %tmp14		%tmp15 = or i32 %tmp10, %tmp14
%tmp16 = getelementptr inbounds i8, i8* %tmp, i32 3		%tmp16 = getelementptr inbounds i8, i8* %tmp, i32 3
%tmp17 = load i8, i8* %tmp16, align 1		%tmp17 = load i8, i8* %tmp16, align 1
%tmp18 = zext i8 %tmp17 to i32		%tmp18 = zext i8 %tmp17 to i32
%tmp19 = or i32 %tmp15, %tmp18		%tmp19 = or i32 %tmp15, %tmp18
ret i32 %tmp19		ret i32 %tmp19
}		}

; Non-zero offsets are not supported for now
; i8* p;		; i8* p;
; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)		; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
define i32 @load_i32_by_i8_unsupported_offset(i32* %arg) {		define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
; CHECK-LABEL: load_i32_by_i8_unsupported_offset:		; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax		; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl 1(%eax), %ecx		; CHECK-NEXT: movl 1(%eax), %eax
; CHECK-NEXT: movzbl 2(%eax), %edx
; CHECK-NEXT: shll $8, %edx
; CHECK-NEXT: orl %ecx, %edx
; CHECK-NEXT: movzbl 3(%eax), %ecx
; CHECK-NEXT: shll $16, %ecx
; CHECK-NEXT: orl %edx, %ecx
; CHECK-NEXT: movzbl 4(%eax), %eax
; CHECK-NEXT: shll $24, %eax
; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: retl		; CHECK-NEXT: retl
;		;
; CHECK64-LABEL: load_i32_by_i8_unsupported_offset:		; CHECK64-LABEL: load_i32_by_i8_nonzero_offset:
; CHECK64: # BB#0:		; CHECK64: # BB#0:
; CHECK64-NEXT: movzbl 1(%rdi), %eax		; CHECK64-NEXT: movl 1(%rdi), %eax
; CHECK64-NEXT: movzbl 2(%rdi), %ecx
; CHECK64-NEXT: shll $8, %ecx
; CHECK64-NEXT: orl %eax, %ecx
; CHECK64-NEXT: movzbl 3(%rdi), %edx
; CHECK64-NEXT: shll $16, %edx
; CHECK64-NEXT: orl %ecx, %edx
; CHECK64-NEXT: movzbl 4(%rdi), %eax
; CHECK64-NEXT: shll $24, %eax
; CHECK64-NEXT: orl %edx, %eax
; CHECK64-NEXT: retq		; CHECK64-NEXT: retq

%tmp = bitcast i32* %arg to i8*		%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
%tmp2 = load i8, i8* %tmp1, align 1		%tmp2 = load i8, i8* %tmp1, align 1
%tmp3 = zext i8 %tmp2 to i32		%tmp3 = zext i8 %tmp2 to i32
%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
%tmp5 = load i8, i8* %tmp4, align 1		%tmp5 = load i8, i8* %tmp4, align 1
%tmp6 = zext i8 %tmp5 to i32		%tmp6 = zext i8 %tmp5 to i32
%tmp7 = shl nuw nsw i32 %tmp6, 8		%tmp7 = shl nuw nsw i32 %tmp6, 8
%tmp8 = or i32 %tmp7, %tmp3		%tmp8 = or i32 %tmp7, %tmp3
%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
%tmp10 = load i8, i8* %tmp9, align 1		%tmp10 = load i8, i8* %tmp9, align 1
%tmp11 = zext i8 %tmp10 to i32		%tmp11 = zext i8 %tmp10 to i32
%tmp12 = shl nuw nsw i32 %tmp11, 16		%tmp12 = shl nuw nsw i32 %tmp11, 16
%tmp13 = or i32 %tmp8, %tmp12		%tmp13 = or i32 %tmp8, %tmp12
%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
%tmp15 = load i8, i8* %tmp14, align 1		%tmp15 = load i8, i8* %tmp14, align 1
%tmp16 = zext i8 %tmp15 to i32		%tmp16 = zext i8 %tmp15 to i32
%tmp17 = shl nuw nsw i32 %tmp16, 24		%tmp17 = shl nuw nsw i32 %tmp16, 24
%tmp18 = or i32 %tmp13, %tmp17		%tmp18 = or i32 %tmp13, %tmp17
ret i32 %tmp18		ret i32 %tmp18
}		}

		; i8* p;
		; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
		define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_neg_offset:
		; CHECK: # BB#0:
		; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
		; CHECK-NEXT: movl -4(%eax), %eax
		; CHECK-NEXT: retl
		;
		; CHECK64-LABEL: load_i32_by_i8_neg_offset:
		; CHECK64: # BB#0:
		; CHECK64-NEXT: movl -4(%rdi), %eax
		; CHECK64-NEXT: retq

		%tmp = bitcast i32* %arg to i8*
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
		%tmp2 = load i8, i8* %tmp1, align 1
		%tmp3 = zext i8 %tmp2 to i32
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
		%tmp15 = load i8, i8* %tmp14, align 1
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p;
		; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
		;
		; Four single-byte loads at offsets 1..4 from the base pointer are
		; zero-extended and OR-ed together, with the highest address (p[4]) ending up
		; in the least significant byte of the result. The expectations below require
		; the pattern to fold into one 32-bit load from offset 1 followed by a bswapl.
		; NOTE(review): the two FileCheck prefixes look like a 32-bit x86 run (argument
		; read from the stack, retl) and a 64-bit run (%rdi, retq); the RUN lines are
		; outside this view, so confirm against the top of the test file.
		define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
		; CHECK: # BB#0:
		; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
		; CHECK-NEXT: movl 1(%eax), %eax
		; CHECK-NEXT: bswapl %eax
		; CHECK-NEXT: retl
		;
		; CHECK64-LABEL: load_i32_by_i8_nonzero_offset_bswap:
		; CHECK64: # BB#0:
		; CHECK64-NEXT: movl 1(%rdi), %eax
		; CHECK64-NEXT: bswapl %eax
		; CHECK64-NEXT: retq

		; View the i32* argument as a byte pointer so individual bytes can be addressed.
		%tmp = bitcast i32* %arg to i8*
		; Result bits 0-7: p[4], used unshifted.
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
		%tmp2 = load i8, i8* %tmp1, align 1
		%tmp3 = zext i8 %tmp2 to i32
		; Result bits 8-15: p[3] << 8.
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		; Result bits 16-23: p[2] << 16.
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		; Result bits 24-31: p[1] << 24 (lowest address in the most significant byte).
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
		%tmp15 = load i8, i8* %tmp14, align 1
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

		; i8* p;
		; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
		;
		; Four single-byte loads at offsets -4..-1 from the base pointer are
		; zero-extended and OR-ed together, with the highest address (p[-1]) ending up
		; in the least significant byte of the result. The expectations below require
		; the pattern to fold into one 32-bit load from offset -4 followed by a bswapl.
		; NOTE(review): the two FileCheck prefixes look like a 32-bit x86 run (argument
		; read from the stack, retl) and a 64-bit run (%rdi, retq); the RUN lines are
		; outside this view, so confirm against the top of the test file.
		define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
		; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
		; CHECK: # BB#0:
		; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
		; CHECK-NEXT: movl -4(%eax), %eax
		; CHECK-NEXT: bswapl %eax
		; CHECK-NEXT: retl
		;
		; CHECK64-LABEL: load_i32_by_i8_neg_offset_bswap:
		; CHECK64: # BB#0:
		; CHECK64-NEXT: movl -4(%rdi), %eax
		; CHECK64-NEXT: bswapl %eax
		; CHECK64-NEXT: retq

		; View the i32* argument as a byte pointer so individual bytes can be addressed.
		%tmp = bitcast i32* %arg to i8*
		; Result bits 0-7: p[-1], used unshifted.
		%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
		%tmp2 = load i8, i8* %tmp1, align 1
		%tmp3 = zext i8 %tmp2 to i32
		; Result bits 8-15: p[-2] << 8.
		%tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
		%tmp5 = load i8, i8* %tmp4, align 1
		%tmp6 = zext i8 %tmp5 to i32
		%tmp7 = shl nuw nsw i32 %tmp6, 8
		%tmp8 = or i32 %tmp7, %tmp3
		; Result bits 16-23: p[-3] << 16.
		%tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
		%tmp10 = load i8, i8* %tmp9, align 1
		%tmp11 = zext i8 %tmp10 to i32
		%tmp12 = shl nuw nsw i32 %tmp11, 16
		%tmp13 = or i32 %tmp8, %tmp12
		; Result bits 24-31: p[-4] << 24 (lowest address in the most significant byte).
		%tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
		%tmp15 = load i8, i8* %tmp14, align 1
		%tmp16 = zext i8 %tmp15 to i32
		%tmp17 = shl nuw nsw i32 %tmp16, 24
		%tmp18 = or i32 %tmp13, %tmp17
		ret i32 %tmp18
		}

; i8* p; i32 i;		; i8* p; i32 i;
; ((i32) p[i] << 24) | ((i32) p[i + 1] << 16) | ((i32) p[i + 2] << 8) | (i32) p[i + 3]		; ((i32) p[i] << 24) | ((i32) p[i + 1] << 16) | ((i32) p[i + 2] << 8) | (i32) p[i + 3]
define i32 @load_i32_by_i8_bswap_base_index_offset(i32* %arg, i32 %arg1) {		define i32 @load_i32_by_i8_bswap_base_index_offset(i32* %arg, i32 %arg1) {
; CHECK-LABEL: load_i32_by_i8_bswap_base_index_offset:		; CHECK-LABEL: load_i32_by_i8_bswap_base_index_offset:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax		; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx		; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl (%ecx,%eax), %eax		; CHECK-NEXT: movl (%ecx,%eax), %eax
▲ Show 20 Lines • Show All 73 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombiner] Support non-zero offset in load combine (Closed, Public)

Details

>

Diff Detail

Event Timeline

Revision Contents

Diff 86624

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

test/CodeGen/AArch64/load-combine-big-endian.ll

test/CodeGen/AArch64/load-combine.ll

test/CodeGen/ARM/load-combine-big-endian.ll

test/CodeGen/ARM/load-combine.ll

test/CodeGen/X86/load-combine.ll

[DAGCombiner] Support non-zero offset in load combine
ClosedPublic