Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4533,6 +4533,7 @@
 
   SmallSet<LoadSDNode *, 8> Loads;
   LoadSDNode *FirstLoad = nullptr;
+  int64_t FirstOffset = INT64_MAX;
 
   bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
   auto ByteAt = IsBigEndianTarget ? BigEndianByteAt : LittleEndianByteAt;
@@ -4575,8 +4576,10 @@
     ByteOffsets[i] = ByteOffsetFromBase;
 
     // Remember the first byte load
-    if (ByteOffsetFromBase == 0)
+    if (ByteOffsetFromBase < FirstOffset) {
       FirstLoad = L;
+      FirstOffset = ByteOffsetFromBase;
+    }
 
     Loads.insert(L);
   }
@@ -4588,8 +4591,9 @@
   // little endian value load
   bool BigEndian = true, LittleEndian = true;
   for (unsigned i = 0; i < ByteWidth; i++) {
-    LittleEndian &= ByteOffsets[i] == LittleEndianByteAt(ByteWidth, i);
-    BigEndian &= ByteOffsets[i] == BigEndianByteAt(ByteWidth, i);
+    int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
+    LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i);
+    BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i);
     if (!BigEndian && !LittleEndian)
       return SDValue();
   }
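The three hunks above change the match from "the lowest loaded byte must sit at offset 0 from the base pointer" to "record the smallest byte offset seen (FirstOffset) and match the endianness pattern on offsets normalized against it". A minimal standalone sketch of the normalized check, in plain C++ with ordinary containers instead of the SelectionDAG types (matchEndianness and its signature are illustrative, not LLVM API):

    #include <algorithm>
    #include <cstdint>
    #include <optional>
    #include <vector>

    // Canonical position of byte i in a ByteWidth-wide value, mirroring the
    // LittleEndianByteAt/BigEndianByteAt helpers used by the patch.
    static int64_t littleEndianByteAt(unsigned ByteWidth, unsigned i) { return i; }
    static int64_t bigEndianByteAt(unsigned ByteWidth, unsigned i) { return ByteWidth - i - 1; }

    enum class Endian { Little, Big };

    // ByteOffsets[i] is the offset from the common base of the byte that ends
    // up in bit position 8*i of the OR'ed result. Normalizing by the minimum
    // offset is what lets patterns starting at p[1] or p[-4] match.
    // Assumes ByteOffsets is non-empty and ByteWidth > 1.
    std::optional<Endian> matchEndianness(const std::vector<int64_t> &ByteOffsets) {
      unsigned ByteWidth = ByteOffsets.size();
      int64_t FirstOffset =
          *std::min_element(ByteOffsets.begin(), ByteOffsets.end());
      bool LittleEndian = true, BigEndian = true;
      for (unsigned i = 0; i < ByteWidth; i++) {
        int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
        LittleEndian &= CurrentByteOffset == littleEndianByteAt(ByteWidth, i);
        BigEndian &= CurrentByteOffset == bigEndianByteAt(ByteWidth, i);
        if (!BigEndian && !LittleEndian)
          return std::nullopt; // neither pattern, so no combine happens
      }
      return LittleEndian ? Endian::Little : Endian::Big;
    }

For example, offsets {1, 2, 3, 4} normalize to {0, 1, 2, 3} and match the little-endian pattern, while {-1, -2, -3, -4} normalize to {3, 2, 1, 0} and match the big-endian one; those are exactly the shapes the new tests below exercise.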
Index: test/CodeGen/AArch64/load-combine-big-endian.ll
===================================================================
--- test/CodeGen/AArch64/load-combine-big-endian.ll
+++ test/CodeGen/AArch64/load-combine-big-endian.ll
@@ -191,3 +191,121 @@
   %tmp37 = or i64 %tmp33, %tmp36
   ret i64 %tmp37
 }
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
+define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK: ldur w8, [x0, #1]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
+define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset:
+; CHECK: ldur w8, [x0, #-4]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
+define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK: ldur w0, [x0, #1]
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
+define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK: ldur w0, [x0, #-4]
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
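A note on how target endianness and pattern endianness interact in the checks above: on big-endian AArch64 the little-endian access pattern lowers to ldur plus rev, while the byte-swapped pattern folds to a plain ldur; the little-endian file below shows the mirror image. At the source level, every "nonzero_offset" test corresponds to a pattern like the following (illustrative C++ only; the tests feed pre-lowered IR to llc, not this code):

    #include <cstdint>

    // p[1] is assumed 4-byte aligned, matching the `align 4` on the first
    // byte load in the IR above.
    uint32_t load_le_u32_at_1(const uint8_t *p) {
      return (uint32_t)p[1] | ((uint32_t)p[2] << 8) |
             ((uint32_t)p[3] << 16) | ((uint32_t)p[4] << 24);
    }

With the offset normalization in place, the whole expression combines into a single 32-bit load at p+1, plus a byte reverse whenever the pattern's endianness differs from the target's.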
Index: test/CodeGen/AArch64/load-combine.ll
===================================================================
--- test/CodeGen/AArch64/load-combine.ll
+++ test/CodeGen/AArch64/load-combine.ll
@@ -178,3 +178,121 @@
   %tmp37 = or i64 %tmp33, %tmp36
   ret i64 %tmp37
 }
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
+define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK: ldur w0, [x0, #1]
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
+define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset:
+; CHECK: ldur w0, [x0, #-4]
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
+define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK: ldur w8, [x0, #1]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
+define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK: ldur w8, [x0, #-4]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
Index: test/CodeGen/ARM/load-combine-big-endian.ll
===================================================================
--- test/CodeGen/ARM/load-combine-big-endian.ll
+++ test/CodeGen/ARM/load-combine-big-endian.ll
@@ -269,3 +269,151 @@
   %tmp37 = or i64 %tmp33, %tmp36
   ret i64 %tmp37
 }
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
+define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK: ldr r0, [r0, #1]
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK-ARMv6: ldr r0, [r0, #1]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
+define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset:
+; CHECK: ldr r0, [r0, #-4]
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset:
+; CHECK-ARMv6: ldr r0, [r0, #-4]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
+define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK: ldr r0, [r0, #1]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK-ARMv6: ldr r0, [r0, #1]
+; CHECK-ARMv6-NEXT: bx lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
+define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK: ldr r0, [r0, #-4]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK-ARMv6: ldr r0, [r0, #-4]
+; CHECK-ARMv6-NEXT: bx lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
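The pre-ARMv6 CHECK sequences above (mov #65280 and mov #16711680 followed by and/orr) are how the byte reverse lowers when no rev instruction is available: two masks select the middle bytes while shifts by 24 move the outer ones. A hedged C++ rendering of exactly that sequence (the function name is illustrative):

    #include <cstdint>

    uint32_t bswap32_by_masks(uint32_t x) {
      uint32_t hi = (x >> 8) & 0x0000FF00u; // and r1, r1, r0, lsr #8   (r1 = #65280)
      uint32_t lo = (x << 8) & 0x00FF0000u; // and r2, r2, r0, lsl #8   (r2 = #16711680)
      hi |= x >> 24;                        // orr r1, r1, r0, lsr #24
      lo |= x << 24;                        // orr r0, r2, r0, lsl #24
      return lo | hi;                       // orr r0, r0, r1
    }

On ARMv6 and later the same tests check that the combine plus byte reverse collapses to ldr followed by a single rev, as in the little-endian ARM file below.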
Index: test/CodeGen/ARM/load-combine.ll
===================================================================
--- test/CodeGen/ARM/load-combine.ll
+++ test/CodeGen/ARM/load-combine.ll
@@ -227,3 +227,151 @@
   %tmp37 = or i64 %tmp33, %tmp36
   ret i64 %tmp37
 }
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
+define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK: ldr r0, [r0, #1]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK-ARMv6: ldr r0, [r0, #1]
+; CHECK-ARMv6-NEXT: bx lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
+define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset:
+; CHECK: ldr r0, [r0, #-4]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset:
+; CHECK-ARMv6: ldr r0, [r0, #-4]
+; CHECK-ARMv6-NEXT: bx lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
+define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK: ldr r0, [r0, #1]
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK-ARMv6: ldr r0, [r0, #1]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
+define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK: ldr r0, [r0, #-4]
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK-ARMv6: ldr r0, [r0, #-4]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
Index: test/CodeGen/X86/load-combine.ll
===================================================================
--- test/CodeGen/X86/load-combine.ll
+++ test/CodeGen/X86/load-combine.ll
@@ -571,37 +571,18 @@
   ret i32 %tmp19
 }
 
-; Non-zero offsets are not supported for now
 ; i8* p;
 ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
-define i32 @load_i32_by_i8_unsupported_offset(i32* %arg) {
-; CHECK-LABEL: load_i32_by_i8_unsupported_offset:
+define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
 ; CHECK: # BB#0:
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movzbl 1(%eax), %ecx
-; CHECK-NEXT: movzbl 2(%eax), %edx
-; CHECK-NEXT: shll $8, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: movzbl 3(%eax), %ecx
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %edx, %ecx
-; CHECK-NEXT: movzbl 4(%eax), %eax
-; CHECK-NEXT: shll $24, %eax
-; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: movl 1(%eax), %eax
 ; CHECK-NEXT: retl
 ;
-; CHECK64-LABEL: load_i32_by_i8_unsupported_offset:
+; CHECK64-LABEL: load_i32_by_i8_nonzero_offset:
 ; CHECK64: # BB#0:
-; CHECK64-NEXT: movzbl 1(%rdi), %eax
-; CHECK64-NEXT: movzbl 2(%rdi), %ecx
-; CHECK64-NEXT: shll $8, %ecx
-; CHECK64-NEXT: orl %eax, %ecx
-; CHECK64-NEXT: movzbl 3(%rdi), %edx
-; CHECK64-NEXT: shll $16, %edx
-; CHECK64-NEXT: orl %ecx, %edx
-; CHECK64-NEXT: movzbl 4(%rdi), %eax
-; CHECK64-NEXT: shll $24, %eax
-; CHECK64-NEXT: orl %edx, %eax
+; CHECK64-NEXT: movl 1(%rdi), %eax
 ; CHECK64-NEXT: retq
 
   %tmp = bitcast i32* %arg to i8*
@@ -626,6 +607,118 @@
   ret i32 %tmp18
 }
 
+; i8* p;
+; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
+define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl -4(%eax), %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i8_neg_offset:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movl -4(%rdi), %eax
+; CHECK64-NEXT: retq
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p;
+; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
+define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl 1(%eax), %eax
+; CHECK-NEXT: bswapl %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movl 1(%rdi), %eax
+; CHECK64-NEXT: bswapl %eax
+; CHECK64-NEXT: retq
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p;
+; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
+define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl -4(%eax), %eax
+; CHECK-NEXT: bswapl %eax
+; CHECK-NEXT: retl
+;
+; CHECK64-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK64: # BB#0:
+; CHECK64-NEXT: movl -4(%rdi), %eax
+; CHECK64-NEXT: bswapl %eax
+; CHECK64-NEXT: retq
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
 ; i8* p; i32 i;
 ; ((i32) p[i] << 24) | ((i32) p[i + 1] << 16) | ((i32) p[i + 2] << 8) | (i32) p[i + 3]
 define i32 @load_i32_by_i8_bswap_base_index_offset(i32* %arg, i32 %arg1) {