Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4541,6 +4541,7 @@ SmallSet Loads; LoadSDNode *FirstLoad = nullptr; + int64_t FirstOffset = INT64_MAX; bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); auto ByteAt = IsBigEndianTarget ? BigEndianByteAt : LittleEndianByteAt; @@ -4583,8 +4584,10 @@ ByteOffsets[i] = ByteOffsetFromBase; // Remember the first byte load - if (ByteOffsetFromBase == 0) + if (ByteOffsetFromBase < FirstOffset) { FirstLoad = L; + FirstOffset = ByteOffsetFromBase; + } Loads.insert(L); } @@ -4596,8 +4599,9 @@ // little endian value load bool BigEndian = true, LittleEndian = true; for (unsigned i = 0; i < ByteWidth; i++) { - LittleEndian &= ByteOffsets[i] == LittleEndianByteAt(ByteWidth, i); - BigEndian &= ByteOffsets[i] == BigEndianByteAt(ByteWidth, i); + int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset; + LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i); + BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i); if (!BigEndian && !LittleEndian) return SDValue(); } Index: test/CodeGen/AArch64/load-combine-big-endian.ll =================================================================== --- test/CodeGen/AArch64/load-combine-big-endian.ll +++ test/CodeGen/AArch64/load-combine-big-endian.ll @@ -196,15 +196,10 @@ ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK: ldrb w8, [x0, #1] -; CHECK-NEXT: ldrb w9, [x0, #2] -; CHECK-NEXT: ldrb w10, [x0, #3] -; CHECK-NEXT: ldrb w11, [x0, #4] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: bfi w8, w10, #16, #8 -; CHECK-NEXT: bfi w8, w11, #24, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK: ldur w8, [x0, #1] +; CHECK-NEXT: rev w0, w8 ; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 %tmp2 = load i8, i8* %tmp1, align 4 @@ -231,15 +226,10 @@ ; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) define i32 @load_i32_by_i8_neg_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: -; CHECK: ldurb w8, [x0, #-4] -; CHECK-NEXT: ldurb w9, [x0, #-3] -; CHECK-NEXT: ldurb w10, [x0, #-2] -; CHECK-NEXT: ldurb w11, [x0, #-1] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: bfi w8, w10, #16, #8 -; CHECK-NEXT: bfi w8, w11, #24, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK: ldur w8, [x0, #-4] +; CHECK-NEXT: rev w0, w8 ; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 %tmp2 = load i8, i8* %tmp1, align 4 @@ -266,15 +256,9 @@ ; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK: ldrb w8, [x0, #4] -; CHECK-NEXT: ldrb w9, [x0, #3] -; CHECK-NEXT: ldrb w10, [x0, #2] -; CHECK-NEXT: ldrb w11, [x0, #1] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: bfi w8, w10, #16, #8 -; CHECK-NEXT: bfi w8, w11, #24, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK: ldur w0, [x0, #1] ; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 %tmp2 = load i8, i8* %tmp1, align 1 @@ -301,15 +285,9 @@ ; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK: ldurb w8, [x0, #-1] -; CHECK-NEXT: ldurb w9, [x0, #-2] -; CHECK-NEXT: ldurb w10, [x0, #-3] -; CHECK-NEXT: ldurb w11, [x0, #-4] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: bfi w8, w10, #16, #8 -; CHECK-NEXT: bfi w8, w11, #24, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK: ldur w0, [x0, #-4] ; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 %tmp2 = load i8, i8* %tmp1, align 1 Index: test/CodeGen/AArch64/load-combine.ll =================================================================== --- test/CodeGen/AArch64/load-combine.ll +++ test/CodeGen/AArch64/load-combine.ll @@ -183,15 +183,9 @@ ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK: ldrb w8, [x0, #1] -; CHECK-NEXT: ldrb w9, [x0, #2] -; CHECK-NEXT: ldrb w10, [x0, #3] -; CHECK-NEXT: ldrb w11, [x0, #4] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: bfi w8, w10, #16, #8 -; CHECK-NEXT: bfi w8, w11, #24, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK: ldur w0, [x0, #1] ; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 %tmp2 = load i8, i8* %tmp1, align 4 @@ -218,15 +212,9 @@ ; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) define i32 @load_i32_by_i8_neg_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: -; CHECK: ldurb w8, [x0, #-4] -; CHECK-NEXT: ldurb w9, [x0, #-3] -; CHECK-NEXT: ldurb w10, [x0, #-2] -; CHECK-NEXT: ldurb w11, [x0, #-1] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: bfi w8, w10, #16, #8 -; CHECK-NEXT: bfi w8, w11, #24, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK: ldur w0, [x0, #-4] ; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 %tmp2 = load i8, i8* %tmp1, align 4 @@ -253,15 +241,10 @@ ; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK: ldrb w8, [x0, #4] -; CHECK-NEXT: ldrb w9, [x0, #3] -; CHECK-NEXT: ldrb w10, [x0, #2] -; CHECK-NEXT: ldrb w11, [x0, #1] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: bfi w8, w10, #16, #8 -; CHECK-NEXT: bfi w8, w11, #24, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK: ldur w8, [x0, #1] +; CHECK-NEXT: rev w0, w8 ; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 %tmp2 = load i8, i8* %tmp1, align 1 @@ -288,15 +271,10 @@ ; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK: ldurb w8, [x0, #-1] -; CHECK-NEXT: ldurb w9, [x0, #-2] -; CHECK-NEXT: ldurb w10, [x0, #-3] -; CHECK-NEXT: ldurb w11, [x0, #-4] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: bfi w8, w10, #16, #8 -; CHECK-NEXT: bfi w8, w11, #24, #8 -; CHECK-NEXT: mov w0, w8 +; CHECK: ldur w8, [x0, #-4] +; CHECK-NEXT: rev w0, w8 ; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 %tmp2 = load i8, i8* %tmp1, align 1 Index: test/CodeGen/ARM/load-combine-big-endian.ll =================================================================== --- test/CodeGen/ARM/load-combine-big-endian.ll +++ test/CodeGen/ARM/load-combine-big-endian.ll @@ -274,23 +274,19 @@ ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK: ldrb r1, [r0, #1] -; CHECK-NEXT: ldrb r2, [r0, #2] -; CHECK-NEXT: ldrb r3, [r0, #3] -; CHECK-NEXT: ldrb r0, [r0, #4] -; CHECK-NEXT: orr r1, r1, r2, lsl #8 -; CHECK-NEXT: orr r1, r1, r3, lsl #16 -; CHECK-NEXT: orr r0, r1, r0, lsl #24 +; CHECK: ldr r0, [r0, #1] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 ; CHECK-NEXT: mov pc, lr ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK-ARMv6: ldrb r1, [r0, #1] -; CHECK-ARMv6-NEXT: ldrb r2, [r0, #2] -; CHECK-ARMv6-NEXT: ldrb r3, [r0, #3] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #4] -; CHECK-ARMv6-NEXT: orr r1, r1, r2, lsl #8 -; CHECK-ARMv6-NEXT: orr r1, r1, r3, lsl #16 -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-ARMv6: ldr r0, [r0, #1] +; CHECK-ARMv6-NEXT: rev r0, r0 ; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* @@ -319,23 +315,19 @@ ; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) define i32 @load_i32_by_i8_neg_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: -; CHECK: ldrb r1, [r0, #-4] -; CHECK-NEXT: ldrb r2, [r0, #-3] -; CHECK-NEXT: ldrb r3, [r0, #-2] -; CHECK-NEXT: ldrb r0, [r0, #-1] -; CHECK-NEXT: orr r1, r1, r2, lsl #8 -; CHECK-NEXT: orr r1, r1, r3, lsl #16 -; CHECK-NEXT: orr r0, r1, r0, lsl #24 +; CHECK: ldr r0, [r0, #-4] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 ; CHECK-NEXT: mov pc, lr ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: -; CHECK-ARMv6: ldrb r1, [r0, #-4] -; CHECK-ARMv6-NEXT: ldrb r2, [r0, #-3] -; CHECK-ARMv6-NEXT: ldrb r3, [r0, #-2] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #-1] -; CHECK-ARMv6-NEXT: orr r1, r1, r2, lsl #8 -; CHECK-ARMv6-NEXT: orr r1, r1, r3, lsl #16 -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-ARMv6: ldr r0, [r0, #-4] +; CHECK-ARMv6-NEXT: rev r0, r0 ; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* @@ -364,23 +356,11 @@ ; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK: ldrb r1, [r0, #1] -; CHECK-NEXT: ldrb r2, [r0, #2] -; CHECK-NEXT: ldrb r3, [r0, #3] -; CHECK-NEXT: ldrb r0, [r0, #4] -; CHECK-NEXT: orr r0, r0, r3, lsl #8 -; CHECK-NEXT: orr r0, r0, r2, lsl #16 -; CHECK-NEXT: orr r0, r0, r1, lsl #24 +; CHECK: ldr r0, [r0, #1] ; CHECK-NEXT: mov pc, lr ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK-ARMv6: ldrb r1, [r0, #1] -; CHECK-ARMv6-NEXT: ldrb r2, [r0, #2] -; CHECK-ARMv6-NEXT: ldrb r3, [r0, #3] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #4] -; CHECK-ARMv6-NEXT: orr r0, r0, r3, lsl #8 -; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #16 -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-ARMv6: ldr r0, [r0, #1] ; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* @@ -409,23 +389,11 @@ ; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK: ldrb r1, [r0, #-4] -; CHECK-NEXT: ldrb r2, [r0, #-3] -; CHECK-NEXT: ldrb r3, [r0, #-2] -; CHECK-NEXT: ldrb r0, [r0, #-1] -; CHECK-NEXT: orr r0, r0, r3, lsl #8 -; CHECK-NEXT: orr r0, r0, r2, lsl #16 -; CHECK-NEXT: orr r0, r0, r1, lsl #24 +; CHECK: ldr r0, [r0, #-4] ; CHECK-NEXT: mov pc, lr ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK-ARMv6: ldrb r1, [r0, #-4] -; CHECK-ARMv6-NEXT: ldrb r2, [r0, #-3] -; CHECK-ARMv6-NEXT: ldrb r3, [r0, #-2] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #-1] -; CHECK-ARMv6-NEXT: orr r0, r0, r3, lsl #8 -; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #16 -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-ARMv6: ldr r0, [r0, #-4] ; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* Index: test/CodeGen/ARM/load-combine.ll =================================================================== --- test/CodeGen/ARM/load-combine.ll +++ test/CodeGen/ARM/load-combine.ll @@ -232,23 +232,11 @@ ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK: ldrb r1, [r0, #1] -; CHECK-NEXT: ldrb r2, [r0, #2] -; CHECK-NEXT: ldrb r3, [r0, #3] -; CHECK-NEXT: ldrb r0, [r0, #4] -; CHECK-NEXT: orr r1, r1, r2, lsl #8 -; CHECK-NEXT: orr r1, r1, r3, lsl #16 -; CHECK-NEXT: orr r0, r1, r0, lsl #24 +; CHECK: ldr r0, [r0, #1] ; CHECK-NEXT: mov pc, lr ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK-ARMv6: ldrb r1, [r0, #1] -; CHECK-ARMv6-NEXT: ldrb r2, [r0, #2] -; CHECK-ARMv6-NEXT: ldrb r3, [r0, #3] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #4] -; CHECK-ARMv6-NEXT: orr r1, r1, r2, lsl #8 -; CHECK-ARMv6-NEXT: orr r1, r1, r3, lsl #16 -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-ARMv6: ldr r0, [r0, #1] ; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* @@ -277,23 +265,11 @@ ; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) define i32 @load_i32_by_i8_neg_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: -; CHECK: ldrb r1, [r0, #-4] -; CHECK-NEXT: ldrb r2, [r0, #-3] -; CHECK-NEXT: ldrb r3, [r0, #-2] -; CHECK-NEXT: ldrb r0, [r0, #-1] -; CHECK-NEXT: orr r1, r1, r2, lsl #8 -; CHECK-NEXT: orr r1, r1, r3, lsl #16 -; CHECK-NEXT: orr r0, r1, r0, lsl #24 +; CHECK: ldr r0, [r0, #-4] ; CHECK-NEXT: mov pc, lr ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: -; CHECK-ARMv6: ldrb r1, [r0, #-4] -; CHECK-ARMv6-NEXT: ldrb r2, [r0, #-3] -; CHECK-ARMv6-NEXT: ldrb r3, [r0, #-2] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #-1] -; CHECK-ARMv6-NEXT: orr r1, r1, r2, lsl #8 -; CHECK-ARMv6-NEXT: orr r1, r1, r3, lsl #16 -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-ARMv6: ldr r0, [r0, #-4] ; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* @@ -322,23 +298,19 @@ ; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK: ldrb r1, [r0, #1] -; CHECK-NEXT: ldrb r2, [r0, #2] -; CHECK-NEXT: ldrb r3, [r0, #3] -; CHECK-NEXT: ldrb r0, [r0, #4] -; CHECK-NEXT: orr r0, r0, r3, lsl #8 -; CHECK-NEXT: orr r0, r0, r2, lsl #16 -; CHECK-NEXT: orr r0, r0, r1, lsl #24 +; CHECK: ldr r0, [r0, #1] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 ; CHECK-NEXT: mov pc, lr ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK-ARMv6: ldrb r1, [r0, #1] -; CHECK-ARMv6-NEXT: ldrb r2, [r0, #2] -; CHECK-ARMv6-NEXT: ldrb r3, [r0, #3] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #4] -; CHECK-ARMv6-NEXT: orr r0, r0, r3, lsl #8 -; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #16 -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-ARMv6: ldr r0, [r0, #1] +; CHECK-ARMv6-NEXT: rev r0, r0 ; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* @@ -367,23 +339,19 @@ ; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK: ldrb r1, [r0, #-4] -; CHECK-NEXT: ldrb r2, [r0, #-3] -; CHECK-NEXT: ldrb r3, [r0, #-2] -; CHECK-NEXT: ldrb r0, [r0, #-1] -; CHECK-NEXT: orr r0, r0, r3, lsl #8 -; CHECK-NEXT: orr r0, r0, r2, lsl #16 -; CHECK-NEXT: orr r0, r0, r1, lsl #24 +; CHECK: ldr r0, [r0, #-4] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 ; CHECK-NEXT: mov pc, lr ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK-ARMv6: ldrb r1, [r0, #-4] -; CHECK-ARMv6-NEXT: ldrb r2, [r0, #-3] -; CHECK-ARMv6-NEXT: ldrb r3, [r0, #-2] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #-1] -; CHECK-ARMv6-NEXT: orr r0, r0, r3, lsl #8 -; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #16 -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-ARMv6: ldr r0, [r0, #-4] +; CHECK-ARMv6-NEXT: rev r0, r0 ; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* Index: test/CodeGen/X86/load-combine.ll =================================================================== --- test/CodeGen/X86/load-combine.ll +++ test/CodeGen/X86/load-combine.ll @@ -571,37 +571,18 @@ ret i32 %tmp19 } -; Non-zero offsets are not supported for now ; i8* p; ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK: # BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl 1(%eax), %ecx -; CHECK-NEXT: movzbl 2(%eax), %edx -; CHECK-NEXT: shll $8, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: movzbl 3(%eax), %ecx -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: movzbl 4(%eax), %eax -; CHECK-NEXT: shll $24, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movl 1(%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK64: # BB#0: -; CHECK64-NEXT: movzbl 1(%rdi), %eax -; CHECK64-NEXT: movzbl 2(%rdi), %ecx -; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: orl %eax, %ecx -; CHECK64-NEXT: movzbl 3(%rdi), %edx -; CHECK64-NEXT: shll $16, %edx -; CHECK64-NEXT: orl %ecx, %edx -; CHECK64-NEXT: movzbl 4(%rdi), %eax -; CHECK64-NEXT: shll $24, %eax -; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: movl 1(%rdi), %eax ; CHECK64-NEXT: retq %tmp = bitcast i32* %arg to i8* @@ -632,30 +613,12 @@ ; CHECK-LABEL: load_i32_by_i8_neg_offset: ; CHECK: # BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl -4(%eax), %ecx -; CHECK-NEXT: movzbl -3(%eax), %edx -; CHECK-NEXT: shll $8, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: movzbl -2(%eax), %ecx -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: movzbl -1(%eax), %eax -; CHECK-NEXT: shll $24, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movl -4(%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_neg_offset: ; CHECK64: # BB#0: -; CHECK64-NEXT: movzbl -4(%rdi), %eax -; CHECK64-NEXT: movzbl -3(%rdi), %ecx -; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: orl %eax, %ecx -; CHECK64-NEXT: movzbl -2(%rdi), %edx -; CHECK64-NEXT: shll $16, %edx -; CHECK64-NEXT: orl %ecx, %edx -; CHECK64-NEXT: movzbl -1(%rdi), %eax -; CHECK64-NEXT: shll $24, %eax -; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: movl -4(%rdi), %eax ; CHECK64-NEXT: retq %tmp = bitcast i32* %arg to i8* @@ -686,30 +649,14 @@ ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: ; CHECK: # BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl 4(%eax), %ecx -; CHECK-NEXT: movzbl 3(%eax), %edx -; CHECK-NEXT: shll $8, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: movzbl 2(%eax), %ecx -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: movzbl 1(%eax), %eax -; CHECK-NEXT: shll $24, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movl 1(%eax), %eax +; CHECK-NEXT: bswapl %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_nonzero_offset_bswap: ; CHECK64: # BB#0: -; CHECK64-NEXT: movzbl 4(%rdi), %eax -; CHECK64-NEXT: movzbl 3(%rdi), %ecx -; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: orl %eax, %ecx -; CHECK64-NEXT: movzbl 2(%rdi), %edx -; CHECK64-NEXT: shll $16, %edx -; CHECK64-NEXT: orl %ecx, %edx -; CHECK64-NEXT: movzbl 1(%rdi), %eax -; CHECK64-NEXT: shll $24, %eax -; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: movl 1(%rdi), %eax +; CHECK64-NEXT: bswapl %eax ; CHECK64-NEXT: retq %tmp = bitcast i32* %arg to i8* @@ -740,30 +687,14 @@ ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: ; CHECK: # BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl -1(%eax), %ecx -; CHECK-NEXT: movzbl -2(%eax), %edx -; CHECK-NEXT: shll $8, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: movzbl -3(%eax), %ecx -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: movzbl -4(%eax), %eax -; CHECK-NEXT: shll $24, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movl -4(%eax), %eax +; CHECK-NEXT: bswapl %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_neg_offset_bswap: ; CHECK64: # BB#0: -; CHECK64-NEXT: movzbl -1(%rdi), %eax -; CHECK64-NEXT: movzbl -2(%rdi), %ecx -; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: orl %eax, %ecx -; CHECK64-NEXT: movzbl -3(%rdi), %edx -; CHECK64-NEXT: shll $16, %edx -; CHECK64-NEXT: orl %ecx, %edx -; CHECK64-NEXT: movzbl -4(%rdi), %eax -; CHECK64-NEXT: shll $24, %eax -; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: movl -4(%rdi), %eax +; CHECK64-NEXT: bswapl %eax ; CHECK64-NEXT: retq %tmp = bitcast i32* %arg to i8*