diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6418,7 +6418,7 @@ // Check if the bytes offsets we are looking at match with either big or // little endian value loaded. Return true for big endian, false for little // endian, and None if match failed. -static Optional isBigEndian(const SmallVector &ByteOffsets, +static Optional isBigEndian(const ArrayRef ByteOffsets, int64_t FirstOffset) { // The endian can be decided only when it is 2 bytes at least. unsigned Width = ByteOffsets.size(); @@ -6498,7 +6498,7 @@ // to the same base address. Collect bytes offsets from Base address into // ByteOffsets. SDValue CombinedValue; - SmallVector ByteOffsets(Width, INT64_MAX); + SmallVector ByteOffsets(Width, INT64_MAX); int64_t FirstOffset = INT64_MAX; StoreSDNode *FirstStore = nullptr; Optional Base; @@ -6649,17 +6649,11 @@ "Can only match load combining against OR nodes"); // Handles simple types only - EVT VT = N->getValueType(0); + const EVT VT = N->getValueType(0); if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); unsigned ByteWidth = VT.getSizeInBits() / 8; - // Before legalize we can introduce too wide illegal loads which will be later - // split into legal sized loads. This enables us to combine i64 load by i8 - // patterns to a couple of i32 loads on 32 bit targets. - if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT)) - return SDValue(); - bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); auto MemoryByteOffset = [&] (ByteProvider P) { assert(P.isMemory() && "Must be a memory byte provider"); @@ -6681,12 +6675,23 @@ // Check if all the bytes of the OR we are looking at are loaded from the same // base address. Collect bytes offsets from Base address in ByteOffsets. - SmallVector ByteOffsets(ByteWidth); - for (unsigned i = 0; i < ByteWidth; i++) { + SmallVector ByteOffsets(ByteWidth); + unsigned ZeroExtendedBytes = 0; + for (int i = ByteWidth - 1; i >= 0; --i) { auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true); - if (!P || !P->isMemory()) // All the bytes must be loaded from memory + if (!P) return SDValue(); + if (P->isConstantZero()) { + // It's OK for the N most significant bytes to be 0, we can just + // zero-extend the load. + if (++ZeroExtendedBytes != ByteWidth - static_cast(i)) + return SDValue(); + continue; + } + + assert(P->isMemory() && "unimplemented"); + LoadSDNode *L = P->Load; assert(L->hasNUsesOfValue(1, 0) && L->isSimple() && !L->isIndexed() && @@ -6725,9 +6730,21 @@ assert(Base && "Base address of the accessed memory location must be set"); assert(FirstOffset != INT64_MAX && "First byte offset must be set"); + const bool NeedsZext = ZeroExtendedBytes > 0; + + const EVT MemVT = + EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8); + + // Before legalize we can introduce too wide illegal loads which will be later + // split into legal sized loads. This enables us to combine i64 load by i8 + // patterns to a couple of i32 loads on 32 bit targets. + if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, MemVT)) + return SDValue(); + // Check if the bytes of the OR we are looking at match with either big or // little endian value load - Optional IsBigEndian = isBigEndian(ByteOffsets, FirstOffset); + Optional IsBigEndian = isBigEndian( + makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset); if (!IsBigEndian.hasValue()) return SDValue(); @@ -6740,10 +6757,11 @@ LoadSDNode *FirstLoad = FirstByteProvider->Load; // The node we are looking at matches with the pattern, check if we can - // replace it with a single load and bswap if needed. + // replace it with a single (possibly zero-extended) load and bswap + shift if + // needed. // If the load needs byte swap check if the target supports it - bool NeedsBswap = IsBigEndianTarget != *IsBigEndian; + const bool NeedsBswap = IsBigEndianTarget != *IsBigEndian; // Before legalize we can introduce illegal bswaps which will be later // converted to an explicit bswap sequence. This way we end up with a single @@ -6751,22 +6769,39 @@ if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT)) return SDValue(); + // If we need to bswap and zero extend, we have to insert a shift. Check that + // it is legal. + if (NeedsBswap && NeedsZext && LegalOperations && + !TLI.isOperationLegal(ISD::SHL, VT)) + return SDValue(); + // Check that a load of the wide type is both allowed and fast on the target bool Fast = false; - bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), - VT, *FirstLoad->getMemOperand(), &Fast); + const bool Allowed = + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, + *FirstLoad->getMemOperand(), &Fast); if (!Allowed || !Fast) return SDValue(); - SDValue NewLoad = - DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(), - FirstLoad->getPointerInfo(), FirstLoad->getAlignment()); + SDValue NewLoad = DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, + SDLoc(N), VT, Chain, FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), MemVT, + FirstLoad->getAlignment()); // Transfer chain users from old loads to the new load. for (LoadSDNode *L : Loads) DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); - return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad; + if (!NeedsBswap) + return NewLoad; + + SDValue ShiftedLoad = + NeedsZext + ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad, + DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT, + SDLoc(N), LegalOperations)) + : NewLoad; + return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad); } // If the target has andn, bsl, or a similar bit-select instruction, diff --git a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll --- a/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/AArch64/load-combine-big-endian.ll @@ -1,11 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm64eb-unknown | FileCheck %s ; i8* p; // p is 4 byte aligned ; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] define i32 @load_i32_by_i8_big_endian(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_big_endian: -; CHECK: ldr w0, [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = load i8, i8* %tmp, align 4 %tmp2 = zext i8 %tmp1 to i32 @@ -31,8 +33,9 @@ ; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[3] << 8) | (i16) p[4]) define i32 @load_i32_by_i16_by_i8_big_endian(i32* %arg) { ; CHECK-LABEL: load_i32_by_i16_by_i8_big_endian: -; CHECK: ldr w0, [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = load i8, i8* %tmp, align 4 %tmp2 = zext i8 %tmp1 to i16 @@ -60,8 +63,9 @@ ; ((i32) p[0] << 16) | (i32) p[1] define i32 @load_i32_by_i16(i32* %arg) { ; CHECK-LABEL: load_i32_by_i16: -; CHECK: ldr w0, [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i16* %tmp1 = load i16, i16* %tmp, align 4 %tmp2 = zext i16 %tmp1 to i32 @@ -78,8 +82,9 @@ ; (i32) (p_16[0] << 16) | ((i32) p[2] << 8) | (i32) p[3] define i32 @load_i32_by_i16_i8(i32* %arg) { ; CHECK-LABEL: load_i32_by_i16_i8: -; CHECK: ldr w0, [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i16* %tmp1 = bitcast i32* %arg to i8* %tmp2 = load i16, i16* %tmp, align 4 @@ -101,9 +106,10 @@ ; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56) define i64 @load_i64_by_i8_bswap(i64* %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: -; CHECK: ldr x8, [x0] -; CHECK-NEXT: rev x0, x8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: rev x0, x8 +; CHECK-NEXT: ret %tmp = bitcast i64* %arg to i8* %tmp1 = load i8, i8* %tmp, align 8 %tmp2 = zext i8 %tmp1 to i64 @@ -149,8 +155,9 @@ ; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7] define i64 @load_i64_by_i8(i64* %arg) { ; CHECK-LABEL: load_i64_by_i8: -; CHECK: ldr x0, [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i64* %arg to i8* %tmp1 = load i8, i8* %tmp, align 8 %tmp2 = zext i8 %tmp1 to i64 @@ -196,9 +203,10 @@ ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK: ldur w8, [x0, #1] -; CHECK-NEXT: rev w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -226,9 +234,10 @@ ; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) define i32 @load_i32_by_i8_neg_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: -; CHECK: ldur w8, [x0, #-4] -; CHECK-NEXT: rev w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #-4] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 @@ -256,8 +265,9 @@ ; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK: ldur w0, [x0, #1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w0, [x0, #1] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 @@ -285,8 +295,9 @@ ; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK: ldur w0, [x0, #-4] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w0, [x0, #-4] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 @@ -316,9 +327,10 @@ ; (i32) bswap(p[0]) | (i32) bswap(p[1] << 16) define i32 @load_i32_by_bswap_i16(i32* %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: -; CHECK: ldr w8, [x0] -; CHECK-NEXT: rev w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i16* %tmp1 = load i16, i16* %tmp, align 4 %tmp11 = call i16 @llvm.bswap.i16(i16 %tmp1) @@ -336,8 +348,9 @@ ; (i32) p[1] | (sext(p[0] << 16) to i32) define i32 @load_i32_by_sext_i16(i32* %arg) { ; CHECK-LABEL: load_i32_by_sext_i16: -; CHECK: ldr w0, [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i16* %tmp1 = load i16, i16* %tmp, align 4 %tmp2 = sext i16 %tmp1 to i32 @@ -354,10 +367,11 @@ ; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24) define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index: -; CHECK: add x8, x0, w1, uxtw -; CHECK-NEXT: ldr w8, [x8, #12] -; CHECK-NEXT: rev w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, uxtw +; CHECK-NEXT: ldr w8, [x8, #12] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret %tmp = add nuw nsw i32 %i, 3 %tmp2 = add nuw nsw i32 %i, 2 %tmp3 = add nuw nsw i32 %i, 1 @@ -392,10 +406,11 @@ ; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24) define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: -; CHECK: add x8, x0, w1, uxtw -; CHECK-NEXT: ldur w8, [x8, #13] -; CHECK-NEXT: rev w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, uxtw +; CHECK-NEXT: ldur w8, [x8, #13] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret %tmp = add nuw nsw i32 %i, 4 %tmp2 = add nuw nsw i32 %i, 3 %tmp3 = add nuw nsw i32 %i, 2 @@ -429,11 +444,11 @@ ; (i32) p[0] | ((i32) p[1] << 8) define i32 @zext_load_i32_by_i8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8: -; CHECK: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: mov w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -451,11 +466,12 @@ ; ((i32) p[0] << 8) | ((i32) p[1] << 16) define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_shl_8: -; CHECK: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: lsl w0, w8, #8 -; CHECK-NEXT: bfi w0, w9, #16, #8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: lsl w0, w8, #8 +; CHECK-NEXT: bfi w0, w9, #16, #8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -474,11 +490,12 @@ ; ((i32) p[0] << 16) | ((i32) p[1] << 24) define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_shl_16: -; CHECK: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: lsl w0, w8, #16 -; CHECK-NEXT: bfi w0, w9, #24, #8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: lsl w0, w8, #16 +; CHECK-NEXT: bfi w0, w9, #24, #8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -496,11 +513,9 @@ ; (i32) p[1] | ((i32) p[0] << 8) define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap: -; CHECK: ldrb w8, [x0, #1] -; CHECK-NEXT: ldrb w9, [x0] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: mov w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -518,11 +533,12 @@ ; ((i32) p[1] << 8) | ((i32) p[0] << 16) define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8: -; CHECK: ldrb w8, [x0, #1] -; CHECK-NEXT: ldrb w9, [x0] -; CHECK-NEXT: lsl w0, w8, #8 -; CHECK-NEXT: bfi w0, w9, #16, #8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: lsl w0, w8, #8 +; CHECK-NEXT: bfi w0, w9, #16, #8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -541,11 +557,12 @@ ; ((i32) p[1] << 16) | ((i32) p[0] << 24) define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16: -; CHECK: ldrb w8, [x0, #1] -; CHECK-NEXT: ldrb w9, [x0] -; CHECK-NEXT: lsl w0, w8, #16 -; CHECK-NEXT: bfi w0, w9, #24, #8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: lsl w0, w8, #16 +; CHECK-NEXT: bfi w0, w9, #24, #8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -568,10 +585,11 @@ ; because in the original DAG we don't have p[1] address available define i16 @load_i16_from_nonzero_offset(i8* %p) { ; CHECK-LABEL: load_i16_from_nonzero_offset: -; CHECK: ldrh w8, [x0] -; CHECK-NEXT: ldrb w0, [x0, #2] -; CHECK-NEXT: bfi w0, w8, #8, #24 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ldrb w0, [x0, #2] +; CHECK-NEXT: bfi w0, w8, #8, #24 +; CHECK-NEXT: ret %p1.i16 = bitcast i8* %p to i16* %p2.i8 = getelementptr i8, i8* %p, i64 2 diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll --- a/llvm/test/CodeGen/AArch64/load-combine.ll +++ b/llvm/test/CodeGen/AArch64/load-combine.ll @@ -1,11 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm64-unknown | FileCheck %s ; i8* p; // p is 1 byte aligned ; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) define i32 @load_i32_by_i8_unaligned(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_unaligned: -; CHECK: ldr w0, [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 %tmp2 = load i8, i8* %tmp1, align 1 @@ -32,8 +34,9 @@ ; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) define i32 @load_i32_by_i8_aligned(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_aligned: -; CHECK: ldr w0, [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 %tmp2 = load i8, i8* %tmp1, align 4 @@ -60,9 +63,10 @@ ; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] define i32 @load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_bswap: -; CHECK: ldr w8, [x0] -; CHECK-NEXT: rev w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = load i8, i8* %tmp, align 4 %tmp2 = zext i8 %tmp1 to i32 @@ -88,8 +92,9 @@ ; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56) define i64 @load_i64_by_i8(i64* %arg) { ; CHECK-LABEL: load_i64_by_i8: -; CHECK: ldr x0, [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i64* %arg to i8* %tmp1 = load i8, i8* %tmp, align 8 %tmp2 = zext i8 %tmp1 to i64 @@ -135,9 +140,10 @@ ; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7] define i64 @load_i64_by_i8_bswap(i64* %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: -; CHECK: ldr x8, [x0] -; CHECK-NEXT: rev x0, x8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: rev x0, x8 +; CHECK-NEXT: ret %tmp = bitcast i64* %arg to i8* %tmp1 = load i8, i8* %tmp, align 8 %tmp2 = zext i8 %tmp1 to i64 @@ -183,8 +189,9 @@ ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK: ldur w0, [x0, #1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w0, [x0, #1] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -212,8 +219,9 @@ ; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) define i32 @load_i32_by_i8_neg_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: -; CHECK: ldur w0, [x0, #-4] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w0, [x0, #-4] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 @@ -241,9 +249,10 @@ ; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK: ldur w8, [x0, #1] -; CHECK-NEXT: rev w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 @@ -271,9 +280,10 @@ ; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK: ldur w8, [x0, #-4] -; CHECK-NEXT: rev w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #-4] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 @@ -303,9 +313,10 @@ ; (i32) bswap(p[1]) | (i32) bswap(p[0] << 16) define i32 @load_i32_by_bswap_i16(i32* %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: -; CHECK: ldr w8, [x0] -; CHECK-NEXT: rev w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i16* %tmp1 = load i16, i16* %tmp, align 4 @@ -324,8 +335,9 @@ ; (i32) p[0] | (sext(p[1] << 16) to i32) define i32 @load_i32_by_sext_i16(i32* %arg) { ; CHECK-LABEL: load_i32_by_sext_i16: -; CHECK: ldr w0, [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i16* %tmp1 = load i16, i16* %tmp, align 4 %tmp2 = zext i16 %tmp1 to i32 @@ -342,9 +354,10 @@ ; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24) define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index: -; CHECK: add x8, x0, w1, uxtw -; CHECK-NEXT: ldr w0, [x8, #12] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, uxtw +; CHECK-NEXT: ldr w0, [x8, #12] +; CHECK-NEXT: ret %tmp = add nuw nsw i32 %i, 3 %tmp2 = add nuw nsw i32 %i, 2 %tmp3 = add nuw nsw i32 %i, 1 @@ -379,9 +392,10 @@ ; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24) define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: -; CHECK: add x8, x0, w1, uxtw -; CHECK-NEXT: ldur w0, [x8, #13] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, uxtw +; CHECK-NEXT: ldur w0, [x8, #13] +; CHECK-NEXT: ret %tmp = add nuw nsw i32 %i, 4 %tmp2 = add nuw nsw i32 %i, 3 %tmp3 = add nuw nsw i32 %i, 2 @@ -416,11 +430,9 @@ ; (i32) p[0] | ((i32) p[1] << 8) define i32 @zext_load_i32_by_i8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8: -; CHECK: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: mov w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w0, [x0] +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -438,11 +450,12 @@ ; ((i32) p[0] << 8) | ((i32) p[1] << 16) define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_shl_8: -; CHECK: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: lsl w0, w8, #8 -; CHECK-NEXT: bfi w0, w9, #16, #8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: lsl w0, w8, #8 +; CHECK-NEXT: bfi w0, w9, #16, #8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -461,11 +474,12 @@ ; ((i32) p[0] << 16) | ((i32) p[1] << 24) define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_shl_16: -; CHECK: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: lsl w0, w8, #16 -; CHECK-NEXT: bfi w0, w9, #24, #8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: lsl w0, w8, #16 +; CHECK-NEXT: bfi w0, w9, #24, #8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -483,11 +497,11 @@ ; (i32) p[1] | ((i32) p[0] << 8) define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap: -; CHECK: ldrb w8, [x0, #1] -; CHECK-NEXT: ldrb w9, [x0] -; CHECK-NEXT: bfi w8, w9, #8, #8 -; CHECK-NEXT: mov w0, w8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: rev w0, w8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -505,11 +519,12 @@ ; ((i32) p[1] << 8) | ((i32) p[0] << 16) define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8: -; CHECK: ldrb w8, [x0, #1] -; CHECK-NEXT: ldrb w9, [x0] -; CHECK-NEXT: lsl w0, w8, #8 -; CHECK-NEXT: bfi w0, w9, #16, #8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: lsl w0, w8, #8 +; CHECK-NEXT: bfi w0, w9, #16, #8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -528,11 +543,12 @@ ; ((i32) p[1] << 16) | ((i32) p[0] << 24) define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16: -; CHECK: ldrb w8, [x0, #1] -; CHECK-NEXT: ldrb w9, [x0] -; CHECK-NEXT: lsl w0, w8, #16 -; CHECK-NEXT: bfi w0, w9, #24, #8 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: lsl w0, w8, #16 +; CHECK-NEXT: bfi w0, w9, #24, #8 +; CHECK-NEXT: ret %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll --- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=armeb-unknown | FileCheck %s ; RUN: llc < %s -mtriple=armv6eb-unknown | FileCheck %s --check-prefix=CHECK-ARMv6 @@ -5,12 +6,15 @@ ; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] define i32 @load_i32_by_i8_big_endian(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_big_endian: -; CHECK: ldr r0, [r0] -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_big_endian: -; CHECK-ARMv6: ldr r0, [r0] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = load i8, i8* %tmp, align 4 %tmp2 = zext i8 %tmp1 to i32 @@ -37,18 +41,23 @@ define i32 @load_i32_by_i8_bswap(i32* %arg) { ; BSWAP is not supported by 32 bit target ; CHECK-LABEL: load_i32_by_i8_bswap: -; CHECK: ldr r0, [r0] -; CHECK: and -; CHECK-NEXT: and -; CHECK-NEXT: orr -; CHECK-NEXT: orr -; CHECK-NEXT: orr -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: -; CHECK-ARMv6: ldr r0, [r0] -; CHECK-ARMv6-NEXT: rev r0, r0 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 %tmp2 = load i8, i8* %tmp1, align 4 @@ -75,12 +84,15 @@ ; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[3] << 8) | (i16) p[4]) define i32 @load_i32_by_i16_by_i8_big_endian(i32* %arg) { ; CHECK-LABEL: load_i32_by_i16_by_i8_big_endian: -; CHECK: ldr r0, [r0] -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i16_by_i8_big_endian: -; CHECK-ARMv6: ldr r0, [r0] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = load i8, i8* %tmp, align 4 %tmp2 = zext i8 %tmp1 to i16 @@ -108,12 +120,15 @@ ; ((i32) p[0] << 16) | (i32) p[1] define i32 @load_i32_by_i16(i32* %arg) { ; CHECK-LABEL: load_i32_by_i16: -; CHECK: ldr r0, [r0] -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i16: -; CHECK-ARMv6: ldr r0, [r0] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i16* %tmp1 = load i16, i16* %tmp, align 4 %tmp2 = zext i16 %tmp1 to i32 @@ -130,12 +145,15 @@ ; (i32) (p_16[0] << 16) | ((i32) p[2] << 8) | (i32) p[3] define i32 @load_i32_by_i16_i8(i32* %arg) { ; CHECK-LABEL: load_i32_by_i16_i8: -; CHECK: ldr r0, [r0] -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i16_i8: -; CHECK-ARMv6: ldr r0, [r0] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i16* %tmp1 = bitcast i32* %arg to i8* %tmp2 = load i16, i16* %tmp, align 4 @@ -157,25 +175,32 @@ ; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56) define i64 @load_i64_by_i8_bswap(i64* %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: -; CHECK: ldr{{.*}}r0 -; CHECK: ldr{{.*}}r0 -; CHECK: and -; CHECK-NEXT: and -; CHECK-NEXT: orr -; CHECK-NEXT: orr -; CHECK-NEXT: and -; CHECK-NEXT: orr -; CHECK-NEXT: and -; CHECK-NEXT: orr -; CHECK-NEXT: orr -; CHECK-NEXT: orr -; CHECK: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: mov r12, #65280 +; CHECK-NEXT: ldr r0, [r0, #4] +; CHECK-NEXT: mov lr, #16711680 +; CHECK-NEXT: and r3, r12, r0, lsr #8 +; CHECK-NEXT: and r2, lr, r0, lsl #8 +; CHECK-NEXT: orr r3, r3, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: and r2, r12, r1, lsr #8 +; CHECK-NEXT: orr r0, r0, r3 +; CHECK-NEXT: and r3, lr, r1, lsl #8 +; CHECK-NEXT: orr r2, r2, r1, lsr #24 +; CHECK-NEXT: orr r1, r3, r1, lsl #24 +; CHECK-NEXT: orr r1, r1, r2 +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: -; CHECK-ARMv6: ldrd r2, r3, [r0] -; CHECK-ARMv6: rev r0, r3 -; CHECK-ARMv6: rev r1, r2 -; CHECK-ARMv6: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrd r2, r3, [r0] +; CHECK-ARMv6-NEXT: rev r0, r3 +; CHECK-ARMv6-NEXT: rev r1, r2 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i64* %arg to i8* %tmp1 = load i8, i8* %tmp, align 8 %tmp2 = zext i8 %tmp1 to i64 @@ -221,14 +246,17 @@ ; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7] define i64 @load_i64_by_i8(i64* %arg) { ; CHECK-LABEL: load_i64_by_i8: -; CHECK: ldr r2, [r0] -; CHECK: ldr r1, [r0, #4] -; CHECK: mov r0, r2 -; CHECK: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r2, [r0] +; CHECK-NEXT: ldr r1, [r0, #4] +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i64_by_i8: -; CHECK-ARMv6: ldrd r0, r1, [r0] -; CHECK-ARMv6: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrd r0, r1, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i64* %arg to i8* %tmp1 = load i8, i8* %tmp, align 8 %tmp2 = zext i8 %tmp1 to i64 @@ -274,20 +302,23 @@ ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK: ldr r0, [r0, #1] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: mov r2, #16711680 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: and r2, r2, r0, lsl #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: orr r0, r2, r0, lsl #24 -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0, #1] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK-ARMv6: ldr r0, [r0, #1] -; CHECK-ARMv6-NEXT: rev r0, r0 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0, #1] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -315,20 +346,23 @@ ; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) define i32 @load_i32_by_i8_neg_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: -; CHECK: ldr r0, [r0, #-4] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: mov r2, #16711680 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: and r2, r2, r0, lsl #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: orr r0, r2, r0, lsl #24 -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0, #-4] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: -; CHECK-ARMv6: ldr r0, [r0, #-4] -; CHECK-ARMv6-NEXT: rev r0, r0 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0, #-4] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 @@ -356,12 +390,15 @@ ; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK: ldr r0, [r0, #1] -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0, #1] +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK-ARMv6: ldr r0, [r0, #1] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0, #1] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 @@ -389,12 +426,15 @@ ; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK: ldr r0, [r0, #-4] -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0, #-4] +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK-ARMv6: ldr r0, [r0, #-4] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0, #-4] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 @@ -424,20 +464,23 @@ ; (i32) bswap(p[0]) | (i32) bswap(p[1] << 16) define i32 @load_i32_by_bswap_i16(i32* %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: -; CHECK: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: mov r2, #16711680 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: and r2, r2, r0, lsl #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: orr r0, r2, r0, lsl #24 -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: -; CHECK-ARMv6: ldr r0, [r0] -; CHECK-ARMv6-NEXT: rev r0, r0 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i16* %tmp1 = load i16, i16* %tmp, align 4 @@ -456,12 +499,14 @@ ; (i32) p[1] | (sext(p[0] << 16) to i32) define i32 @load_i32_by_sext_i16(i32* %arg) { ; CHECK-LABEL: load_i32_by_sext_i16: -; CHECK: ldr r0, [r0] -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_sext_i16: -; CHECK-ARMv6: ldr r0, [r0] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i16* %tmp1 = load i16, i16* %tmp, align 4 %tmp2 = sext i16 %tmp1 to i32 @@ -478,22 +523,24 @@ ; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24) define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index: -; CHECK: add r0, r0, r1 -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: mov r2, #16711680 -; CHECK-NEXT: ldr r0, [r0, #12] -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: and r2, r2, r0, lsl #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: orr r0, r2, r0, lsl #24 -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: add r0, r0, r1 +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: ldr r0, [r0, #12] +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index: -; CHECK-ARMv6: add r0, r0, r1 -; CHECK-ARMv6-NEXT: ldr r0, [r0, #12] -; CHECK-ARMv6-NEXT: rev r0, r0 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: add r0, r0, r1 +; CHECK-ARMv6-NEXT: ldr r0, [r0, #12] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr %tmp = add nuw nsw i32 %i, 3 %tmp2 = add nuw nsw i32 %i, 2 %tmp3 = add nuw nsw i32 %i, 1 @@ -528,22 +575,24 @@ ; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24) define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: -; CHECK: add r0, r1, r0 -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: mov r2, #16711680 -; CHECK-NEXT: ldr r0, [r0, #13] -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: and r2, r2, r0, lsl #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: orr r0, r2, r0, lsl #24 -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: add r0, r1, r0 +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: ldr r0, [r0, #13] +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: -; CHECK-ARMv6: add r0, r1, r0 -; CHECK-ARMv6-NEXT: ldr r0, [r0, #13] -; CHECK-ARMv6-NEXT: rev r0, r0 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: add r0, r1, r0 +; CHECK-ARMv6-NEXT: ldr r0, [r0, #13] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr %tmp = add nuw nsw i32 %i, 4 %tmp2 = add nuw nsw i32 %i, 3 @@ -579,16 +628,21 @@ ; (i32) p[0] | ((i32) p[1] << 8) define i32 @zext_load_i32_by_i8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: orr r0, r1, r0, lsl #8 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: mov r1, #255 +; CHECK-NEXT: orr r1, r1, #65280 +; CHECK-NEXT: and r1, r1, r0, lsl #8 +; CHECK-NEXT: lsl r0, r0, #16 +; CHECK-NEXT: orr r0, r1, r0, lsr #24 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrh r0, [r0] +; CHECK-ARMv6-NEXT: lsl r0, r0, #16 +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -606,18 +660,20 @@ ; ((i32) p[0] << 8) | ((i32) p[1] << 16) define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_shl_8: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: lsl r0, r0, #16 -; CHECK-NEXT: orr r0, r0, r1, lsl #8 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r0, r0, #16 +; CHECK-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_8: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: lsl r0, r0, #16 -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r0, r0, #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -636,18 +692,20 @@ ; ((i32) p[0] << 16) | ((i32) p[1] << 24) define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_shl_16: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r1, lsl #16 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r0, r0, #24 +; CHECK-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_16: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: lsl r0, r0, #24 -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r0, r0, #24 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -666,16 +724,14 @@ ; (i32) p[1] | ((i32) p[0] << 8) define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: orr r0, r0, r1, lsl #8 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrh r0, [r0] +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -693,18 +749,20 @@ ; ((i32) p[1] << 8) | ((i32) p[0] << 16) define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: lsl r1, r1, #16 -; CHECK-NEXT: orr r0, r1, r0, lsl #8 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r1, r1, #16 +; CHECK-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_8: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: lsl r1, r1, #16 -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r1, r1, #16 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -723,18 +781,20 @@ ; ((i32) p[1] << 16) | ((i32) p[0] << 24) define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: lsl r1, r1, #24 -; CHECK-NEXT: orr r0, r1, r0, lsl #16 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r1, r1, #24 +; CHECK-NEXT: orr r0, r1, r0, lsl #16 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_16: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: lsl r1, r1, #24 -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #16 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r1, r1, #24 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #16 +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -757,16 +817,18 @@ ; because in the original DAG we don't have p[1] address available define i16 @load_i16_from_nonzero_offset(i8* %p) { ; CHECK-LABEL: load_i16_from_nonzero_offset: -; CHECK: ldrh r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #2] -; CHECK-NEXT: orr r0, r0, r1, lsl #8 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrh r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #2] +; CHECK-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i16_from_nonzero_offset: -; CHECK-ARMv6: ldrh r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #2] -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrh r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #2] +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-ARMv6-NEXT: bx lr %p1.i16 = bitcast i8* %p to i16* %p2.i8 = getelementptr i8, i8* %p, i64 2 diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-unknown | FileCheck %s ; RUN: llc < %s -mtriple=armv6-unknown | FileCheck %s --check-prefix=CHECK-ARMv6 @@ -5,20 +6,27 @@ ; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) define i32 @load_i32_by_i8_unaligned(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_unaligned: -; CHECK: ldrb{{.*}}r0 -; CHECK: ldrb{{.*}}r0 -; CHECK: ldrb{{.*}}r0 -; CHECK: ldrb{{.*}}r0 -; CHECK: orr -; CHECK: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r2, [r0, #1] +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: ldrb r3, [r0, #2] +; CHECK-NEXT: ldrb r0, [r0, #3] +; CHECK-NEXT: orr r1, r1, r2, lsl #8 +; CHECK-NEXT: orr r1, r1, r3, lsl #16 +; CHECK-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_unaligned: -; CHECK-ARMv6: ldrb{{.*}}r0 -; CHECK-ARMv6: ldrb{{.*}}r0 -; CHECK-ARMv6: ldrb{{.*}}r0 -; CHECK-ARMv6: ldrb{{.*}}r0 -; CHECK-ARMv6: orr -; CHECK-ARMv6: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #1] +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r3, [r0, #2] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #3] +; CHECK-ARMv6-NEXT: orr r1, r1, r2, lsl #8 +; CHECK-ARMv6-NEXT: orr r1, r1, r3, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 %tmp2 = load i8, i8* %tmp1, align 1 @@ -45,12 +53,15 @@ ; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) define i32 @load_i32_by_i8_aligned(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_aligned: -; CHECK: ldr r0, [r0] -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_aligned: -; CHECK-ARMv6: ldr r0, [r0] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 %tmp2 = load i8, i8* %tmp1, align 4 @@ -78,18 +89,23 @@ define i32 @load_i32_by_i8_bswap(i32* %arg) { ; BSWAP is not supported by 32 bit target ; CHECK-LABEL: load_i32_by_i8_bswap: -; CHECK: ldr r0, [r0] -; CHECK: and -; CHECK-NEXT: and -; CHECK-NEXT: orr -; CHECK-NEXT: orr -; CHECK-NEXT: orr -; CHECK: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: -; CHECK-ARMv6: ldr r0, [r0] -; CHECK-ARMv6-NEXT: rev r0, r0 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = load i8, i8* %tmp, align 4 %tmp2 = zext i8 %tmp1 to i32 @@ -115,14 +131,17 @@ ; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56) define i64 @load_i64_by_i8(i64* %arg) { ; CHECK-LABEL: load_i64_by_i8: -; CHECK: ldr r2, [r0] -; CHECK-NEXT: ldr r1, [r0, #4] -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r2, [r0] +; CHECK-NEXT: ldr r1, [r0, #4] +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i64_by_i8: -; CHECK-ARMv6: ldrd r0, r1, [r0] -; CHECK-ARMv6: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrd r0, r1, [r0] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i64* %arg to i8* %tmp1 = load i8, i8* %tmp, align 8 %tmp2 = zext i8 %tmp1 to i64 @@ -168,25 +187,32 @@ ; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7] define i64 @load_i64_by_i8_bswap(i64* %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: -; CHECK: ldr{{.*}}r0 -; CHECK: ldr{{.*}}r0 -; CHECK: and -; CHECK-NEXT: and -; CHECK-NEXT: orr -; CHECK-NEXT: orr -; CHECK-NEXT: and -; CHECK-NEXT: orr -; CHECK-NEXT: and -; CHECK-NEXT: orr -; CHECK-NEXT: orr -; CHECK-NEXT: orr -; CHECK: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: mov r12, #65280 +; CHECK-NEXT: ldr r0, [r0, #4] +; CHECK-NEXT: mov lr, #16711680 +; CHECK-NEXT: and r3, r12, r0, lsr #8 +; CHECK-NEXT: and r2, lr, r0, lsl #8 +; CHECK-NEXT: orr r3, r3, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: and r2, r12, r1, lsr #8 +; CHECK-NEXT: orr r0, r0, r3 +; CHECK-NEXT: and r3, lr, r1, lsl #8 +; CHECK-NEXT: orr r2, r2, r1, lsr #24 +; CHECK-NEXT: orr r1, r3, r1, lsl #24 +; CHECK-NEXT: orr r1, r1, r2 +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: -; CHECK-ARMv6: ldrd r2, r3, [r0] -; CHECK-ARMv6: rev r0, r3 -; CHECK-ARMv6: rev r1, r2 -; CHECK-ARMv6: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrd r2, r3, [r0] +; CHECK-ARMv6-NEXT: rev r0, r3 +; CHECK-ARMv6-NEXT: rev r1, r2 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i64* %arg to i8* %tmp1 = load i8, i8* %tmp, align 8 %tmp2 = zext i8 %tmp1 to i64 @@ -232,12 +258,15 @@ ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK: ldr r0, [r0, #1] -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0, #1] +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: -; CHECK-ARMv6: ldr r0, [r0, #1] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0, #1] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -265,12 +294,15 @@ ; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) define i32 @load_i32_by_i8_neg_offset(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: -; CHECK: ldr r0, [r0, #-4] -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0, #-4] +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: -; CHECK-ARMv6: ldr r0, [r0, #-4] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0, #-4] +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 @@ -298,20 +330,23 @@ ; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK: ldr r0, [r0, #1] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: mov r2, #16711680 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: and r2, r2, r0, lsl #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: orr r0, r2, r0, lsl #24 -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0, #1] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: -; CHECK-ARMv6: ldr r0, [r0, #1] -; CHECK-ARMv6-NEXT: rev r0, r0 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0, #1] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 @@ -339,20 +374,23 @@ ; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK: ldr r0, [r0, #-4] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: mov r2, #16711680 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: and r2, r2, r0, lsl #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: orr r0, r2, r0, lsl #24 -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0, #-4] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: -; CHECK-ARMv6: ldr r0, [r0, #-4] -; CHECK-ARMv6-NEXT: rev r0, r0 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0, #-4] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 @@ -382,20 +420,23 @@ ; (i32) bswap(p[1]) | (i32) bswap(p[0] << 16) define i32 @load_i32_by_bswap_i16(i32* %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: -; CHECK: ldr r0, [r0] -; CHECK-NEXT: mov r1, #65280 -; CHECK-NEXT: mov r2, #16711680 -; CHECK-NEXT: and r1, r1, r0, lsr #8 -; CHECK-NEXT: and r2, r2, r0, lsl #8 -; CHECK-NEXT: orr r1, r1, r0, lsr #24 -; CHECK-NEXT: orr r0, r2, r0, lsl #24 -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: mov pc, lr - +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov r1, #65280 +; CHECK-NEXT: mov r2, #16711680 +; CHECK-NEXT: and r1, r1, r0, lsr #8 +; CHECK-NEXT: and r2, r2, r0, lsl #8 +; CHECK-NEXT: orr r1, r1, r0, lsr #24 +; CHECK-NEXT: orr r0, r2, r0, lsl #24 +; CHECK-NEXT: orr r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: -; CHECK-ARMv6: ldr r0, [r0] -; CHECK-ARMv6-NEXT: rev r0, r0 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0] +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr + %tmp = bitcast i32* %arg to i16* %tmp1 = load i16, i16* %tmp, align 4 @@ -414,12 +455,14 @@ ; (i32) p[0] | (sext(p[1] << 16) to i32) define i32 @load_i32_by_sext_i16(i32* %arg) { ; CHECK-LABEL: load_i32_by_sext_i16: -; CHECK: ldr r0, [r0] -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_sext_i16: -; CHECK-ARMv6: ldr r0, [r0] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldr r0, [r0] +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i16* %tmp1 = load i16, i16* %tmp, align 4 %tmp2 = zext i16 %tmp1 to i32 @@ -436,14 +479,16 @@ ; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24) define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index: -; CHECK: add r0, r0, r1 -; CHECK-NEXT: ldr r0, [r0, #12] -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: add r0, r0, r1 +; CHECK-NEXT: ldr r0, [r0, #12] +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index: -; CHECK-ARMv6: add r0, r0, r1 -; CHECK-ARMv6-NEXT: ldr r0, [r0, #12] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: add r0, r0, r1 +; CHECK-ARMv6-NEXT: ldr r0, [r0, #12] +; CHECK-ARMv6-NEXT: bx lr %tmp = add nuw nsw i32 %i, 3 %tmp2 = add nuw nsw i32 %i, 2 @@ -479,14 +524,16 @@ ; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24) define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: -; CHECK: add r0, r1, r0 -; CHECK-NEXT: ldr r0, [r0, #13] -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: add r0, r1, r0 +; CHECK-NEXT: ldr r0, [r0, #13] +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: -; CHECK-ARMv6: add r0, r1, r0 -; CHECK-ARMv6-NEXT: ldr r0, [r0, #13] -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: add r0, r1, r0 +; CHECK-ARMv6-NEXT: ldr r0, [r0, #13] +; CHECK-ARMv6-NEXT: bx lr %tmp = add nuw nsw i32 %i, 4 %tmp2 = add nuw nsw i32 %i, 3 %tmp3 = add nuw nsw i32 %i, 2 @@ -521,16 +568,14 @@ ; (i32) p[0] | ((i32) p[1] << 8) define i32 @zext_load_i32_by_i8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: orr r0, r1, r0, lsl #8 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrh r0, [r0] +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -548,18 +593,20 @@ ; ((i32) p[0] << 8) | ((i32) p[1] << 16) define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_shl_8: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: lsl r0, r0, #16 -; CHECK-NEXT: orr r0, r0, r1, lsl #8 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r0, r0, #16 +; CHECK-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_8: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: lsl r0, r0, #16 -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r0, r0, #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -578,18 +625,20 @@ ; ((i32) p[0] << 16) | ((i32) p[1] << 24) define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_shl_16: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: lsl r0, r0, #24 -; CHECK-NEXT: orr r0, r0, r1, lsl #16 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r0, r0, #24 +; CHECK-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_16: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: lsl r0, r0, #24 -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r0, r0, #24 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -608,16 +657,21 @@ ; (i32) p[1] | ((i32) p[0] << 8) define i32 @zext_load_i32_by_i8_bswap(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: orr r0, r0, r1, lsl #8 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: mov r1, #255 +; CHECK-NEXT: orr r1, r1, #65280 +; CHECK-NEXT: and r1, r1, r0, lsl #8 +; CHECK-NEXT: lsl r0, r0, #16 +; CHECK-NEXT: orr r0, r1, r0, lsr #24 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrh r0, [r0] +; CHECK-ARMv6-NEXT: lsl r0, r0, #16 +; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -635,18 +689,20 @@ ; ((i32) p[1] << 8) | ((i32) p[0] << 16) define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: lsl r1, r1, #16 -; CHECK-NEXT: orr r0, r1, r0, lsl #8 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r1, r1, #16 +; CHECK-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_8: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: lsl r1, r1, #16 -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r1, r1, #16 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8 +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 @@ -665,18 +721,20 @@ ; ((i32) p[1] << 16) | ((i32) p[0] << 24) define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) { ; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16: -; CHECK: ldrb r1, [r0] -; CHECK-NEXT: ldrb r0, [r0, #1] -; CHECK-NEXT: lsl r1, r1, #24 -; CHECK-NEXT: orr r0, r1, r0, lsl #16 -; CHECK-NEXT: mov pc, lr +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: ldrb r0, [r0, #1] +; CHECK-NEXT: lsl r1, r1, #24 +; CHECK-NEXT: orr r0, r1, r0, lsl #16 +; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_16: -; CHECK-ARMv6: ldrb r1, [r0] -; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] -; CHECK-ARMv6-NEXT: lsl r1, r1, #24 -; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #16 -; CHECK-ARMv6-NEXT: bx lr +; CHECK-ARMv6: @ %bb.0: +; CHECK-ARMv6-NEXT: ldrb r1, [r0] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #1] +; CHECK-ARMv6-NEXT: lsl r1, r1, #24 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #16 +; CHECK-ARMv6-NEXT: bx lr %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll --- a/llvm/test/CodeGen/X86/load-combine.ll +++ b/llvm/test/CodeGen/X86/load-combine.ll @@ -1119,18 +1119,12 @@ ; CHECK-LABEL: zext_load_i32_by_i8: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl (%eax), %ecx -; CHECK-NEXT: movzbl 1(%eax), %eax -; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movzwl (%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: zext_load_i32_by_i8: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movzbl (%rdi), %ecx -; CHECK64-NEXT: movzbl 1(%rdi), %eax -; CHECK64-NEXT: shll $8, %eax -; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: movzwl (%rdi), %eax ; CHECK64-NEXT: retq %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0 @@ -1218,18 +1212,16 @@ ; CHECK-LABEL: zext_load_i32_by_i8_bswap: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl 1(%eax), %ecx -; CHECK-NEXT: movzbl (%eax), %eax -; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: movzwl (%eax), %eax +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: bswapl %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: zext_load_i32_by_i8_bswap: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movzbl 1(%rdi), %ecx -; CHECK64-NEXT: movzbl (%rdi), %eax -; CHECK64-NEXT: shll $8, %eax -; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: movzwl (%rdi), %eax +; CHECK64-NEXT: shll $16, %eax +; CHECK64-NEXT: bswapl %eax ; CHECK64-NEXT: retq %tmp = bitcast i32* %arg to i8* %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1