Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -375,6 +375,7 @@ unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL); SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); + SDValue MatchLoadCombine(SDNode *N); SDValue ReduceLoadWidth(SDNode *N); SDValue ReduceLoadOpStoreWidth(SDNode *N); SDValue splitMergedValStore(StoreSDNode *ST); @@ -3969,6 +3970,9 @@ if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) return SDValue(Rot, 0); + if (SDValue Load = MatchLoadCombine(N)) + return Load; + // Simplify the operands using demanded-bits information. if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0))) @@ -4340,6 +4344,278 @@ }; } // namespace +namespace { +/// Represents the origin of an individual byte in a load combine pattern. The +/// value of the byte is either unknown, zero, or comes from memory. +struct ByteProvider { + enum ProviderTy { + Unknown, ZeroConstant, Memory + }; + + ProviderTy Kind; + // Load and ByteOffset are set for Memory providers only. + // Load represents the node which loads the byte from memory. + // ByteOffset is the offset of the byte in the value produced by the load. + LoadSDNode *Load; + unsigned ByteOffset; + + ByteProvider() : Kind(ProviderTy::Unknown), Load(nullptr), ByteOffset(0) {} + + static ByteProvider getUnknown() { + return ByteProvider(ProviderTy::Unknown, nullptr, 0); + } + static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) { + return ByteProvider(ProviderTy::Memory, Load, ByteOffset); + } + static ByteProvider getZero() { + return ByteProvider(ProviderTy::ZeroConstant, nullptr, 0); + } + + bool operator==(const ByteProvider &Other) const { + return Other.Kind == Kind && + Other.Load == Load && + Other.ByteOffset == ByteOffset; + } + +private: + ByteProvider(ProviderTy Kind, LoadSDNode *Load, unsigned ByteOffset) : + Kind(Kind), Load(Load), ByteOffset(ByteOffset) {} +}; + +/// Recursively traverses the expression collecting the origin of individual +/// bytes of the given value. For all the values except the root of the +/// expression, verifies that they have no uses outside of the expression.
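+/// +/// For illustration, with byte 0 denoting the least significant byte of the +/// value, the providers collected for (shl (zext (load i8 p) to i32), 8) would +/// be [ZeroConstant, Memory(load p, byte 0), ZeroConstant, ZeroConstant].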
+const Optional<SmallVector<ByteProvider, 4>> +collectByteProviders(SDValue Op, bool CheckNumberOfUses = false) { + if (CheckNumberOfUses && !Op.hasOneUse()) + return None; + + unsigned BitWidth = Op.getScalarValueSizeInBits(); + if (BitWidth % 8 != 0) + return None; + unsigned ByteWidth = Op.getScalarValueSizeInBits() / 8; + + switch (Op.getOpcode()) { + case ISD::OR: { + auto &LHS = collectByteProviders(Op->getOperand(0), + /*CheckNumberOfUses=*/true); + auto &RHS = collectByteProviders(Op->getOperand(1), + /*CheckNumberOfUses=*/true); + if (!LHS || !RHS) + return None; + + auto OR = [] (ByteProvider LHS, ByteProvider RHS) { + if (LHS == RHS) + return LHS; + if (LHS.Kind == ByteProvider::Unknown || + RHS.Kind == ByteProvider::Unknown) + return ByteProvider::getUnknown(); + if (LHS.Kind == ByteProvider::Memory && + RHS.Kind == ByteProvider::Memory) + return ByteProvider::getUnknown(); + + if (LHS.Kind == ByteProvider::Memory) + return LHS; + else + return RHS; + }; + + SmallVector<ByteProvider, 4> Result(ByteWidth); + for (unsigned i = 0; i < LHS->size(); i++) + Result[i] = OR(LHS.getValue()[i], RHS.getValue()[i]); + + return Result; + } + case ISD::SHL: { + ConstantSDNode *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!ShiftOp) + return None; + + uint64_t BitShift = ShiftOp->getZExtValue(); + if (BitShift % 8 != 0) + return None; + uint64_t ByteShift = BitShift / 8; + + auto &Original = collectByteProviders(Op->getOperand(0), + /*CheckNumberOfUses=*/true); + if (!Original) + return None; + + SmallVector<ByteProvider, 4> Result; + Result.insert(Result.begin(), ByteShift, ByteProvider::getZero()); + Result.insert(Result.end(), Original->begin(), + std::prev(Original->end(), ByteShift)); + assert(Result.size() == ByteWidth && "sanity"); + return Result; + } + case ISD::ZERO_EXTEND: { + auto &Original = collectByteProviders(Op->getOperand(0), + /*CheckNumberOfUses=*/true); + if (!Original) + return None; + + SmallVector<ByteProvider, 4> Result; + unsigned NarrowByteWidth = Original->size(); + Result.insert(Result.begin(), Original->begin(), Original->end()); + Result.insert(Result.end(), ByteWidth - NarrowByteWidth, + ByteProvider::getZero()); + assert(Result.size() == ByteWidth && "sanity"); + return Result; + } + case ISD::LOAD: { + LoadSDNode *L = cast<LoadSDNode>(Op.getNode()); + if (L->isVolatile() || L->isIndexed() || + L->getExtensionType() != ISD::NON_EXTLOAD) + return None; + + EVT VT = L->getMemoryVT(); + assert(BitWidth == VT.getSizeInBits() && "sanity"); + + SmallVector<ByteProvider, 4> Result(ByteWidth); + for (unsigned i = 0; i < ByteWidth; i++) + Result[i] = ByteProvider::getMemory(L, i); + + return Result; + } + } + + return None; +} +} // namespace + +/// Match a pattern where a wide type scalar value is loaded by several narrow +/// loads and combined by shifts and ors. Fold it into a single load or a load +/// and a BSWAP if the target supports it. +/// +/// Assuming little endian target: +/// i8 *a = ... +/// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24) +/// => +/// i32 val = *((i32)a) +/// +/// i8 *a = ...
+/// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3] +/// => +/// i32 val = BSWAP(*((i32)a)) +SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { + assert(N->getOpcode() == ISD::OR && + "Can only match load combining against OR nodes"); + + // Handles simple types only + EVT VT = N->getValueType(0); + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + // There is nothing to do here if the target can't load a value of this type + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegal(ISD::LOAD, VT)) + return SDValue(); + + // Calculate byte providers for the OR we are looking at + auto Res = collectByteProviders(SDValue(N, 0)); + if (!Res) + return SDValue(); + auto &Bytes = Res.getValue(); + unsigned ByteWidth = Bytes.size(); + assert(VT.getSizeInBits() == ByteWidth * 8 && "sanity"); + + auto LittleEndianByteAt = [] (unsigned BW, unsigned i) { return i; }; + auto BigEndianByteAt = [] (unsigned BW, unsigned i) { return BW - i - 1; }; + + Optional<BaseIndexOffset> Base; + SDValue Chain; + + SmallSet<LoadSDNode *, 8> Loads; + LoadSDNode *FirstLoad = nullptr; + + // Check if all the bytes of the OR we are looking at are loaded from the same + // base address. Collect byte offsets from the Base address in ByteOffsets. + SmallVector<int64_t, 4> ByteOffsets(ByteWidth); + for (unsigned i = 0; i < ByteWidth; i++) { + // All the bytes must be loaded from memory + if (Bytes[i].Kind != ByteProvider::Memory) + return SDValue(); + + LoadSDNode *L = Bytes[i].Load; + assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() && + (L->getExtensionType() == ISD::NON_EXTLOAD) && + "Must be enforced by collectByteProviders"); + assert(L->getOffset().isUndef() && + "Unindexed load must have undef offset"); + + // All loads must share the same chain + SDValue LChain = L->getChain(); + if (!Chain) + Chain = LChain; + if (Chain != LChain) + return SDValue(); + + // Loads must share the same base address + BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG); + if (!Base) + Base = Ptr; + if (!Base->equalBaseIndex(Ptr)) + return SDValue(); + + // Calculate the offset of the current byte from the base address + unsigned LoadByteWidth = L->getMemoryVT().getSizeInBits() / 8; + int64_t MemoryByteOffset = DAG.getDataLayout().isBigEndian() + ? BigEndianByteAt(LoadByteWidth, Bytes[i].ByteOffset) + : LittleEndianByteAt(LoadByteWidth, Bytes[i].ByteOffset); + int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset; + ByteOffsets[i] = ByteOffsetFromBase; + + // Remember the first byte load + if (ByteOffsetFromBase == 0) + FirstLoad = L; + + Loads.insert(L); + } + assert(Base && "must be set"); + + // Check if the bytes of the OR we are looking at match with either big or + // little endian value load + bool BigEndian = true, LittleEndian = true; + for (unsigned i = 0; i < ByteWidth; i++) { + LittleEndian &= ByteOffsets[i] == LittleEndianByteAt(ByteWidth, i); + BigEndian &= ByteOffsets[i] == BigEndianByteAt(ByteWidth, i); + if (!BigEndian && !LittleEndian) + return SDValue(); + } + assert((BigEndian != LittleEndian) && "should be either or"); + assert(FirstLoad && "must be set"); + + // The node we are looking at matches with the pattern, check if we can + // replace it with a single load and bswap if needed.
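+ // For example, for a four byte wide OR the collected ByteOffsets match + // exactly [0, 1, 2, 3] for a little endian value load and exactly + // [3, 2, 1, 0] for a big endian one; a byte swap is needed whenever the + // endianness of the matched pattern differs from the endianness of the + // target.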
+ + // If the load needs a byte swap, check if the target supports it + bool NeedsBswap = DAG.getDataLayout().isBigEndian() != BigEndian; + if (NeedsBswap && !TLI.isOperationLegal(ISD::BSWAP, VT)) + return SDValue(); + + // Check that a load of the wide type is both allowed and fast on the target + bool Fast = false; + bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + VT, FirstLoad->getAddressSpace(), + FirstLoad->getAlignment(), &Fast); + if (!Allowed || !Fast) + return SDValue(); + + SDValue NewLoad = DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), + FirstLoad->getAlignment()); + + // Transfer chain users from old loads to the new load. + for (LoadSDNode *L : Loads) + DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), + SDValue(NewLoad.getNode(), 1)); + + if (NeedsBswap) + return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad); + else + return NewLoad; +} + SDValue DAGCombiner::visitXOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); Index: test/CodeGen/ARM/load-combine-big-endian.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/load-combine-big-endian.ll @@ -0,0 +1,234 @@ +; RUN: llc < %s -mtriple=armeb-unknown | FileCheck %s +; RUN: llc < %s -mtriple=arm64eb-unknown | FileCheck %s --check-prefix=CHECK64 + +; i8* p; // p is 4 byte aligned +; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] +define i32 @load_i32_by_i8_big_endian(i32*) { +; CHECK-LABEL: load_i32_by_i8_big_endian: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr + +; CHECK64-LABEL: load_i32_by_i8_big_endian: +; CHECK64: ldr w0, [x0] +; CHECK64-NEXT: ret + %2 = bitcast i32* %0 to i8* + %3 = load i8, i8* %2, align 4 + %4 = zext i8 %3 to i32 + %5 = shl nuw nsw i32 %4, 24 + %6 = getelementptr inbounds i8, i8* %2, i32 1 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i32 + %9 = shl nuw nsw i32 %8, 16 + %10 = or i32 %9, %5 + %11 = getelementptr inbounds i8, i8* %2, i32 2 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i32 + %14 = shl nuw nsw i32 %13, 8 + %15 = or i32 %10, %14 + %16 = getelementptr inbounds i8, i8* %2, i32 3 + %17 = load i8, i8* %16, align 1 + %18 = zext i8 %17 to i32 + %19 = or i32 %15, %18 + ret i32 %19 +} + +; i8* p; // p is 4 byte aligned +; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[2] << 8) | (i16) p[3]) +define i32 @load_i32_by_i16_by_i8_big_endian(i32*) { +; CHECK-LABEL: load_i32_by_i16_by_i8_big_endian: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr + +; CHECK64-LABEL: load_i32_by_i16_by_i8_big_endian: +; CHECK64: ldr w0, [x0] +; CHECK64-NEXT: ret + %2 = bitcast i32* %0 to i8* + %3 = load i8, i8* %2, align 4 + %4 = zext i8 %3 to i16 + %5 = getelementptr inbounds i8, i8* %2, i32 1 + %6 = load i8, i8* %5, align 1 + %7 = zext i8 %6 to i16 + %8 = shl nuw nsw i16 %4, 8 + %9 = or i16 %8, %7 + %10 = getelementptr inbounds i8, i8* %2, i32 2 + %11 = load i8, i8* %10, align 1 + %12 = zext i8 %11 to i16 + %13 = getelementptr inbounds i8, i8* %2, i32 3 + %14 = load i8, i8* %13, align 1 + %15 = zext i8 %14 to i16 + %16 = shl nuw nsw i16 %12, 8 + %17 = or i16 %16, %15 + %18 = zext i16 %9 to i32 + %19 = zext i16 %17 to i32 + %20 = shl nuw nsw i32 %18, 16 + %21 = or i32 %20, %19 + ret i32 %21 +}
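+ +; For the nested i16/i8 pattern above, taking byte 0 as the least significant +; byte of the value, the providers of the final i32 OR are [Memory(p[3]), +; Memory(p[2]), Memory(p[1]), Memory(p[0])]. The byte offsets [3, 2, 1, 0] +; match a big endian value load, so on the big endian targets checked here the +; whole expression folds to a plain load with no byte swap.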
+ +; i16* p; // p is 4 byte aligned +; ((i32) p[0] << 16) | (i32) p[1] +define i32 @load_i32_by_i16(i32*) { +; CHECK-LABEL: load_i32_by_i16: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr + +; CHECK64-LABEL: load_i32_by_i16: +; CHECK64: ldr w0, [x0] +; CHECK64-NEXT: ret + %2 = bitcast i32* %0 to i16* + %3 = load i16, i16* %2, align 4 + %4 = zext i16 %3 to i32 + %5 = getelementptr inbounds i16, i16* %2, i32 1 + %6 = load i16, i16* %5, align 1 + %7 = zext i16 %6 to i32 + %8 = shl nuw nsw i32 %4, 16 + %9 = or i32 %8, %7 + ret i32 %9 +} + +; i16* p_16; // p_16 is 4 byte aligned +; i8* p_8 = (i8*) p_16; +; ((i32) p_16[0] << 16) | ((i32) p_8[2] << 8) | (i32) p_8[3] +define i32 @load_i32_by_i16_i8(i32*) { +; CHECK-LABEL: load_i32_by_i16_i8: +; CHECK: ldr r0, [r0] +; CHECK-NEXT: mov pc, lr + +; CHECK64-LABEL: load_i32_by_i16_i8: +; CHECK64: ldr w0, [x0] +; CHECK64-NEXT: ret + %2 = bitcast i32* %0 to i16* + %3 = bitcast i32* %0 to i8* + %4 = load i16, i16* %2, align 4 + %5 = zext i16 %4 to i32 + %6 = shl nuw nsw i32 %5, 16 + %7 = getelementptr inbounds i8, i8* %3, i32 2 + %8 = load i8, i8* %7, align 1 + %9 = zext i8 %8 to i32 + %10 = shl nuw nsw i32 %9, 8 + %11 = getelementptr inbounds i8, i8* %3, i32 3 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i32 + %14 = or i32 %10, %13 + %15 = or i32 %14, %6 + ret i32 %15 +} + +; i8* p; // p is 8 byte aligned +; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56) +define i64 @load_i64_by_i8_bswap(i64*) { +; CHECK-LABEL: load_i64_by_i8_bswap: +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: orr +; CHECK: mov pc, lr + +; CHECK64-LABEL: load_i64_by_i8_bswap: +; CHECK64: ldr x8, [x0] +; CHECK64-NEXT: rev x0, x8 +; CHECK64-NEXT: ret + %2 = bitcast i64* %0 to i8* + %3 = load i8, i8* %2, align 8 + %4 = zext i8 %3 to i64 + %5 = getelementptr inbounds i8, i8* %2, i64 1 + %6 = load i8, i8* %5, align 1 + %7 = zext i8 %6 to i64 + %8 = shl nuw nsw i64 %7, 8 + %9 = or i64 %8, %4 + %10 = getelementptr inbounds i8, i8* %2, i64 2 + %11 = load i8, i8* %10, align 1 + %12 = zext i8 %11 to i64 + %13 = shl nuw nsw i64 %12, 16 + %14 = or i64 %9, %13 + %15 = getelementptr inbounds i8, i8* %2, i64 3 + %16 = load i8, i8* %15, align 1 + %17 = zext i8 %16 to i64 + %18 = shl nuw nsw i64 %17, 24 + %19 = or i64 %14, %18 + %20 = getelementptr inbounds i8, i8* %2, i64 4 + %21 = load i8, i8* %20, align 1 + %22 = zext i8 %21 to i64 + %23 = shl nuw nsw i64 %22, 32 + %24 = or i64 %19, %23 + %25 = getelementptr inbounds i8, i8* %2, i64 5 + %26 = load i8, i8* %25, align 1 + %27 = zext i8 %26 to i64 + %28 = shl nuw nsw i64 %27, 40 + %29 = or i64 %24, %28 + %30 = getelementptr inbounds i8, i8* %2, i64 6 + %31 = load i8, i8* %30, align 1 + %32 = zext i8 %31 to i64 + %33 = shl nuw nsw i64 %32, 48 + %34 = or i64 %29, %33 + %35 = getelementptr inbounds i8, i8* %2, i64 7 + %36 = load i8, i8* %35, align 1 + %37 = zext i8 %36 to i64 + %38 = shl nuw i64 %37, 56 + %39 = or i64 %34, %38 + ret i64 %39 +} + +; i8* p; // p is 8 byte aligned +; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7] +define i64 @load_i64_by_i8(i64*) { +; CHECK-LABEL: load_i64_by_i8: +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: orr +; CHECK: mov pc, lr + +; CHECK64-LABEL: load_i64_by_i8: +; CHECK64: ldr x0, [x0] +; CHECK64-NEXT: ret +
%2 = bitcast i64* %0 to i8* + %3 = load i8, i8* %2, align 8 + %4 = zext i8 %3 to i64 + %5 = shl nuw i64 %4, 56 + %6 = getelementptr inbounds i8, i8* %2, i64 1 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i64 + %9 = shl nuw nsw i64 %8, 48 + %10 = or i64 %9, %5 + %11 = getelementptr inbounds i8, i8* %2, i64 2 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i64 + %14 = shl nuw nsw i64 %13, 40 + %15 = or i64 %10, %14 + %16 = getelementptr inbounds i8, i8* %2, i64 3 + %17 = load i8, i8* %16, align 1 + %18 = zext i8 %17 to i64 + %19 = shl nuw nsw i64 %18, 32 + %20 = or i64 %15, %19 + %21 = getelementptr inbounds i8, i8* %2, i64 4 + %22 = load i8, i8* %21, align 1 + %23 = zext i8 %22 to i64 + %24 = shl nuw nsw i64 %23, 24 + %25 = or i64 %20, %24 + %26 = getelementptr inbounds i8, i8* %2, i64 5 + %27 = load i8, i8* %26, align 1 + %28 = zext i8 %27 to i64 + %29 = shl nuw nsw i64 %28, 16 + %30 = or i64 %25, %29 + %31 = getelementptr inbounds i8, i8* %2, i64 6 + %32 = load i8, i8* %31, align 1 + %33 = zext i8 %32 to i64 + %34 = shl nuw nsw i64 %33, 8 + %35 = or i64 %30, %34 + %36 = getelementptr inbounds i8, i8* %2, i64 7 + %37 = load i8, i8* %36, align 1 + %38 = zext i8 %37 to i64 + %39 = or i64 %35, %38 + ret i64 %39 +} Index: test/CodeGen/ARM/load-combine.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/load-combine.ll @@ -0,0 +1,226 @@ +; RUN: llc < %s -mtriple=arm-unknown | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown | FileCheck %s --check-prefix=CHECK64 + +; i8* p; // p is 1 byte aligned +; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) +define i32 @load_i32_by_i8_unaligned(i32*) { +; CHECK-LABEL: load_i32_by_i8_unaligned: +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: orr +; CHECK: mov pc, lr + +; CHECK64-LABEL: load_i32_by_i8_unaligned: +; CHECK64: ldr w0, [x0] +; CHECK64-NEXT: ret + %2 = bitcast i32* %0 to i8* + %3 = getelementptr inbounds i8, i8* %2, i32 0 + %4 = load i8, i8* %2, align 1 + %5 = zext i8 %4 to i32 + %6 = getelementptr inbounds i8, i8* %2, i32 1 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i32 + %9 = shl nuw nsw i32 %8, 8 + %10 = or i32 %9, %5 + %11 = getelementptr inbounds i8, i8* %2, i32 2 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i32 + %14 = shl nuw nsw i32 %13, 16 + %15 = or i32 %10, %14 + %16 = getelementptr inbounds i8, i8* %2, i32 3 + %17 = load i8, i8* %16, align 1 + %18 = zext i8 %17 to i32 + %19 = shl nuw nsw i32 %18, 24 + %20 = or i32 %15, %19 + ret i32 %20 +} + +; i8* p; // p is 4 byte aligned +; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) +define i32 @load_i32_by_i8_aligned(i32*) { +; CHECK-LABEL: load_i32_by_i8_aligned: +; CHECK: ldr r0, [r0] +; CHECK: mov pc, lr + +; CHECK64-LABEL: load_i32_by_i8_aligned: +; CHECK64: ldr w0, [x0] +; CHECK64-NEXT: ret + %2 = bitcast i32* %0 to i8* + %3 = getelementptr inbounds i8, i8* %2, i32 0 + %4 = load i8, i8* %2, align 4 + %5 = zext i8 %4 to i32 + %6 = getelementptr inbounds i8, i8* %2, i32 1 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i32 + %9 = shl nuw nsw i32 %8, 8 + %10 = or i32 %9, %5 + %11 = getelementptr inbounds i8, i8* %2, i32 2 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i32 + %14 = shl nuw nsw i32 %13, 16 + %15 = or i32 %10, %14 + %16 = getelementptr inbounds i8, i8* %2, i32 3 + %17 = load i8, i8* %16, align 1 + %18 = zext i8 %17 to i32 + %19 = shl nuw nsw i32 %18, 24 + %20 = or 
i32 %15, %19 + ret i32 %20 +} + +; i8* p; // p is 4 byte aligned +; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] +define i32 @load_i32_by_i8_bswap(i32*) { +; BSWAP is not supported by 32 bit target +; CHECK-LABEL: load_i32_by_i8_bswap: +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: orr +; CHECK: mov pc, lr + +; CHECK64-LABEL: load_i32_by_i8_bswap: +; CHECK64: ldr w8, [x0] +; CHECK64-NEXT: rev w0, w8 +; CHECK64-NEXT: ret + %2 = bitcast i32* %0 to i8* + %3 = load i8, i8* %2, align 4 + %4 = zext i8 %3 to i32 + %5 = shl nuw nsw i32 %4, 24 + %6 = getelementptr inbounds i8, i8* %2, i32 1 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i32 + %9 = shl nuw nsw i32 %8, 16 + %10 = or i32 %9, %5 + %11 = getelementptr inbounds i8, i8* %2, i32 2 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i32 + %14 = shl nuw nsw i32 %13, 8 + %15 = or i32 %10, %14 + %16 = getelementptr inbounds i8, i8* %2, i32 3 + %17 = load i8, i8* %16, align 1 + %18 = zext i8 %17 to i32 + %19 = or i32 %15, %18 + ret i32 %19 +} + +; i8* p; // p is 8 byte aligned +; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56) +define i64 @load_i64_by_i8(i64*) { +; CHECK-LABEL: load_i64_by_i8: +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: orr +; CHECK: mov pc, lr + +; CHECK64-LABEL: load_i64_by_i8: +; CHECK64: ldr x0, [x0] +; CHECK64-NEXT: ret + %2 = bitcast i64* %0 to i8* + %3 = load i8, i8* %2, align 8 + %4 = zext i8 %3 to i64 + %5 = getelementptr inbounds i8, i8* %2, i64 1 + %6 = load i8, i8* %5, align 1 + %7 = zext i8 %6 to i64 + %8 = shl nuw nsw i64 %7, 8 + %9 = or i64 %8, %4 + %10 = getelementptr inbounds i8, i8* %2, i64 2 + %11 = load i8, i8* %10, align 1 + %12 = zext i8 %11 to i64 + %13 = shl nuw nsw i64 %12, 16 + %14 = or i64 %9, %13 + %15 = getelementptr inbounds i8, i8* %2, i64 3 + %16 = load i8, i8* %15, align 1 + %17 = zext i8 %16 to i64 + %18 = shl nuw nsw i64 %17, 24 + %19 = or i64 %14, %18 + %20 = getelementptr inbounds i8, i8* %2, i64 4 + %21 = load i8, i8* %20, align 1 + %22 = zext i8 %21 to i64 + %23 = shl nuw nsw i64 %22, 32 + %24 = or i64 %19, %23 + %25 = getelementptr inbounds i8, i8* %2, i64 5 + %26 = load i8, i8* %25, align 1 + %27 = zext i8 %26 to i64 + %28 = shl nuw nsw i64 %27, 40 + %29 = or i64 %24, %28 + %30 = getelementptr inbounds i8, i8* %2, i64 6 + %31 = load i8, i8* %30, align 1 + %32 = zext i8 %31 to i64 + %33 = shl nuw nsw i64 %32, 48 + %34 = or i64 %29, %33 + %35 = getelementptr inbounds i8, i8* %2, i64 7 + %36 = load i8, i8* %35, align 1 + %37 = zext i8 %36 to i64 + %38 = shl nuw i64 %37, 56 + %39 = or i64 %34, %38 + ret i64 %39 +} + +; i8* p; // p is 8 byte aligned +; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7] +define i64 @load_i64_by_i8_bswap(i64*) { +; CHECK-LABEL: load_i64_by_i8_bswap: +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: ldrb{{.*}}r0 +; CHECK: orr +; CHECK: mov pc, lr + +; CHECK64-LABEL: load_i64_by_i8_bswap: +; CHECK64: ldr x8, [x0] +; CHECK64-NEXT: rev x0, x8 +; CHECK64-NEXT: ret + %2 = bitcast i64* %0 to i8* + 
%3 = load i8, i8* %2, align 8 + %4 = zext i8 %3 to i64 + %5 = shl nuw i64 %4, 56 + %6 = getelementptr inbounds i8, i8* %2, i64 1 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i64 + %9 = shl nuw nsw i64 %8, 48 + %10 = or i64 %9, %5 + %11 = getelementptr inbounds i8, i8* %2, i64 2 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i64 + %14 = shl nuw nsw i64 %13, 40 + %15 = or i64 %10, %14 + %16 = getelementptr inbounds i8, i8* %2, i64 3 + %17 = load i8, i8* %16, align 1 + %18 = zext i8 %17 to i64 + %19 = shl nuw nsw i64 %18, 32 + %20 = or i64 %15, %19 + %21 = getelementptr inbounds i8, i8* %2, i64 4 + %22 = load i8, i8* %21, align 1 + %23 = zext i8 %22 to i64 + %24 = shl nuw nsw i64 %23, 24 + %25 = or i64 %20, %24 + %26 = getelementptr inbounds i8, i8* %2, i64 5 + %27 = load i8, i8* %26, align 1 + %28 = zext i8 %27 to i64 + %29 = shl nuw nsw i64 %28, 16 + %30 = or i64 %25, %29 + %31 = getelementptr inbounds i8, i8* %2, i64 6 + %32 = load i8, i8* %31, align 1 + %33 = zext i8 %32 to i64 + %34 = shl nuw nsw i64 %33, 8 + %35 = or i64 %30, %34 + %36 = getelementptr inbounds i8, i8* %2, i64 7 + %37 = load i8, i8* %36, align 1 + %38 = zext i8 %37 to i64 + %39 = or i64 %35, %38 + ret i64 %39 +} Index: test/CodeGen/X86/load-combine.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/load-combine.ll @@ -0,0 +1,734 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK64 + +; i8* p; +; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) +define i32 @load_i32_by_i8(i32*) { +; CHECK-LABEL: load_i32_by_i8: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl (%eax), %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i8: +; CHECK64: # BB#0: +; CHECK64-NEXT: movl (%rdi), %eax +; CHECK64-NEXT: retq + + %2 = bitcast i32* %0 to i8* + %3 = load i8, i8* %2, align 1 + %4 = zext i8 %3 to i32 + %5 = getelementptr inbounds i8, i8* %2, i32 1 + %6 = load i8, i8* %5, align 1 + %7 = zext i8 %6 to i32 + %8 = shl nuw nsw i32 %7, 8 + %9 = or i32 %8, %4 + %10 = getelementptr inbounds i8, i8* %2, i32 2 + %11 = load i8, i8* %10, align 1 + %12 = zext i8 %11 to i32 + %13 = shl nuw nsw i32 %12, 16 + %14 = or i32 %9, %13 + %15 = getelementptr inbounds i8, i8* %2, i32 3 + %16 = load i8, i8* %15, align 1 + %17 = zext i8 %16 to i32 + %18 = shl nuw nsw i32 %17, 24 + %19 = or i32 %14, %18 + ret i32 %19 +} + +; i8* p; +; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] +define i32 @load_i32_by_i8_bswap(i32*) { +; CHECK-LABEL: load_i32_by_i8_bswap: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl (%eax), %eax +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i8_bswap: +; CHECK64: # BB#0: +; CHECK64-NEXT: movl (%rdi), %eax +; CHECK64-NEXT: bswapl %eax +; CHECK64-NEXT: retq + + %2 = bitcast i32* %0 to i8* + %3 = load i8, i8* %2, align 1 + %4 = zext i8 %3 to i32 + %5 = shl nuw nsw i32 %4, 24 + %6 = getelementptr inbounds i8, i8* %2, i32 1 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i32 + %9 = shl nuw nsw i32 %8, 16 + %10 = or i32 %9, %5 + %11 = getelementptr inbounds i8, i8* %2, i32 2 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i32 + %14 = shl nuw nsw i32 %13, 8 + %15 = or i32 %10, %14 + %16 = getelementptr inbounds i8, i8* %2, i32 3 + %17 = load i8, i8* 
%16, align 1 + %18 = zext i8 %17 to i32 + %19 = or i32 %15, %18 + ret i32 %19 +} + +; i16* p; +; (i32) p[0] | ((i32) p[1] << 16) +define i32 @load_i32_by_i16(i32*) { +; CHECK-LABEL: load_i32_by_i16: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl (%eax), %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i16: +; CHECK64: # BB#0: +; CHECK64-NEXT: movl (%rdi), %eax +; CHECK64-NEXT: retq + + %2 = bitcast i32* %0 to i16* + %3 = load i16, i16* %2, align 1 + %4 = zext i16 %3 to i32 + %5 = getelementptr inbounds i16, i16* %2, i32 1 + %6 = load i16, i16* %5, align 1 + %7 = zext i16 %6 to i32 + %8 = shl nuw nsw i32 %7, 16 + %9 = or i32 %8, %4 + ret i32 %9 +} + +; i16* p_16; +; i8* p_8 = (i8*) p_16; +; (i32) p_16[0] | ((i32) p_8[2] << 16) | ((i32) p_8[3] << 24) +define i32 @load_i32_by_i16_i8(i32*) { +; CHECK-LABEL: load_i32_by_i16_i8: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl (%eax), %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i16_i8: +; CHECK64: # BB#0: +; CHECK64-NEXT: movl (%rdi), %eax +; CHECK64-NEXT: retq + + %2 = bitcast i32* %0 to i16* + %3 = bitcast i32* %0 to i8* + %4 = load i16, i16* %2, align 1 + %5 = zext i16 %4 to i32 + %6 = getelementptr inbounds i8, i8* %3, i32 2 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i32 + %9 = shl nuw nsw i32 %8, 16 + %10 = getelementptr inbounds i8, i8* %3, i32 3 + %11 = load i8, i8* %10, align 1 + %12 = zext i8 %11 to i32 + %13 = shl nuw nsw i32 %12, 24 + %14 = or i32 %9, %13 + %15 = or i32 %14, %5 + ret i32 %15 +} + + +; i8* p; +; (i32) ((i16) p[0] | ((i16) p[1] << 8)) | ((i32) ((i16) p[2] | ((i16) p[3] << 8)) << 16) +define i32 @load_i32_by_i16_by_i8(i32*) { +; CHECK-LABEL: load_i32_by_i16_by_i8: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl (%eax), %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i16_by_i8: +; CHECK64: # BB#0: +; CHECK64-NEXT: movl (%rdi), %eax +; CHECK64-NEXT: retq + + %2 = bitcast i32* %0 to i8* + %3 = load i8, i8* %2, align 1 + %4 = zext i8 %3 to i16 + %5 = getelementptr inbounds i8, i8* %2, i32 1 + %6 = load i8, i8* %5, align 1 + %7 = zext i8 %6 to i16 + %8 = shl nuw nsw i16 %7, 8 + %9 = or i16 %8, %4 + %10 = getelementptr inbounds i8, i8* %2, i32 2 + %11 = load i8, i8* %10, align 1 + %12 = zext i8 %11 to i16 + %13 = getelementptr inbounds i8, i8* %2, i32 3 + %14 = load i8, i8* %13, align 1 + %15 = zext i8 %14 to i16 + %16 = shl nuw nsw i16 %15, 8 + %17 = or i16 %16, %12 + %18 = zext i16 %9 to i32 + %19 = zext i16 %17 to i32 + %20 = shl nuw nsw i32 %19, 16 + %21 = or i32 %20, %18 + ret i32 %21 +} + +; i8* p; +; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[2] << 8) | (i16) p[3]) +define i32 @load_i32_by_i16_by_i8_bswap(i32*) { +; CHECK-LABEL: load_i32_by_i16_by_i8_bswap: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl (%eax), %eax +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i16_by_i8_bswap: +; CHECK64: # BB#0: +; CHECK64-NEXT: movl (%rdi), %eax +; CHECK64-NEXT: bswapl %eax +; CHECK64-NEXT: retq + + %2 = bitcast i32* %0 to i8* + %3 = load i8, i8* %2, align 1 + %4 = zext i8 %3 to i16 + %5 = getelementptr inbounds i8, i8* %2, i32 1 + %6 = load i8, i8* %5, align 1 + %7 = zext i8 %6 to i16 + %8 = shl nuw nsw i16 %4, 8 + %9 = or i16 %8, %7 + %10 = getelementptr inbounds i8, i8* %2, i32 2 + %11 = load i8, i8* %10, align 1 + %12 = zext i8 %11 to i16 + %13 = getelementptr inbounds i8, i8* %2, i32 3 + %14 = load i8,
i8* %13, align 1 + %15 = zext i8 %14 to i16 + %16 = shl nuw nsw i16 %12, 8 + %17 = or i16 %16, %15 + %18 = zext i16 %9 to i32 + %19 = zext i16 %17 to i32 + %20 = shl nuw nsw i32 %18, 16 + %21 = or i32 %20, %19 + ret i32 %21 +} + +; i8* p; +; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56) +define i64 @load_i64_by_i8(i64*) { +; CHECK-LABEL: load_i64_by_i8: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: .cfi_offset %esi, -12 +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: .cfi_offset %edi, -8 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movzbl (%ecx), %eax +; CHECK-NEXT: movzbl 1(%ecx), %edx +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: orl %eax, %edx +; CHECK-NEXT: movzbl 2(%ecx), %esi +; CHECK-NEXT: shll $16, %esi +; CHECK-NEXT: orl %edx, %esi +; CHECK-NEXT: movzbl 3(%ecx), %eax +; CHECK-NEXT: shll $24, %eax +; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: movzbl 4(%ecx), %edx +; CHECK-NEXT: movzbl 5(%ecx), %esi +; CHECK-NEXT: shll $8, %esi +; CHECK-NEXT: orl %edx, %esi +; CHECK-NEXT: movzbl 6(%ecx), %edi +; CHECK-NEXT: shll $16, %edi +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: movzbl 7(%ecx), %edx +; CHECK-NEXT: shll $24, %edx +; CHECK-NEXT: orl %edi, %edx +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i64_by_i8: +; CHECK64: # BB#0: +; CHECK64-NEXT: movq (%rdi), %rax +; CHECK64-NEXT: retq + + %2 = bitcast i64* %0 to i8* + %3 = load i8, i8* %2, align 1 + %4 = zext i8 %3 to i64 + %5 = getelementptr inbounds i8, i8* %2, i64 1 + %6 = load i8, i8* %5, align 1 + %7 = zext i8 %6 to i64 + %8 = shl nuw nsw i64 %7, 8 + %9 = or i64 %8, %4 + %10 = getelementptr inbounds i8, i8* %2, i64 2 + %11 = load i8, i8* %10, align 1 + %12 = zext i8 %11 to i64 + %13 = shl nuw nsw i64 %12, 16 + %14 = or i64 %9, %13 + %15 = getelementptr inbounds i8, i8* %2, i64 3 + %16 = load i8, i8* %15, align 1 + %17 = zext i8 %16 to i64 + %18 = shl nuw nsw i64 %17, 24 + %19 = or i64 %14, %18 + %20 = getelementptr inbounds i8, i8* %2, i64 4 + %21 = load i8, i8* %20, align 1 + %22 = zext i8 %21 to i64 + %23 = shl nuw nsw i64 %22, 32 + %24 = or i64 %19, %23 + %25 = getelementptr inbounds i8, i8* %2, i64 5 + %26 = load i8, i8* %25, align 1 + %27 = zext i8 %26 to i64 + %28 = shl nuw nsw i64 %27, 40 + %29 = or i64 %24, %28 + %30 = getelementptr inbounds i8, i8* %2, i64 6 + %31 = load i8, i8* %30, align 1 + %32 = zext i8 %31 to i64 + %33 = shl nuw nsw i64 %32, 48 + %34 = or i64 %29, %33 + %35 = getelementptr inbounds i8, i8* %2, i64 7 + %36 = load i8, i8* %35, align 1 + %37 = zext i8 %36 to i64 + %38 = shl nuw i64 %37, 56 + %39 = or i64 %34, %38 + ret i64 %39 +} + +; i8* p; +; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7] +define i64 @load_i64_by_i8_bswap(i64*) { +; CHECK-LABEL: load_i64_by_i8_bswap: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: .cfi_offset %esi, -8 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl (%eax), %ecx +; CHECK-NEXT: shll $24, %ecx +; CHECK-NEXT: movzbl 1(%eax), %edx +; CHECK-NEXT: shll $16, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: movzbl 2(%eax), 
%ecx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: movzbl 3(%eax), %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: movzbl 4(%eax), %ecx +; CHECK-NEXT: shll $24, %ecx +; CHECK-NEXT: movzbl 5(%eax), %esi +; CHECK-NEXT: shll $16, %esi +; CHECK-NEXT: orl %ecx, %esi +; CHECK-NEXT: movzbl 6(%eax), %ecx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: orl %esi, %ecx +; CHECK-NEXT: movzbl 7(%eax), %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: popl %esi +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i64_by_i8_bswap: +; CHECK64: # BB#0: +; CHECK64-NEXT: movq (%rdi), %rax +; CHECK64-NEXT: bswapq %rax +; CHECK64-NEXT: retq + + %2 = bitcast i64* %0 to i8* + %3 = load i8, i8* %2, align 1 + %4 = zext i8 %3 to i64 + %5 = shl nuw i64 %4, 56 + %6 = getelementptr inbounds i8, i8* %2, i64 1 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i64 + %9 = shl nuw nsw i64 %8, 48 + %10 = or i64 %9, %5 + %11 = getelementptr inbounds i8, i8* %2, i64 2 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i64 + %14 = shl nuw nsw i64 %13, 40 + %15 = or i64 %10, %14 + %16 = getelementptr inbounds i8, i8* %2, i64 3 + %17 = load i8, i8* %16, align 1 + %18 = zext i8 %17 to i64 + %19 = shl nuw nsw i64 %18, 32 + %20 = or i64 %15, %19 + %21 = getelementptr inbounds i8, i8* %2, i64 4 + %22 = load i8, i8* %21, align 1 + %23 = zext i8 %22 to i64 + %24 = shl nuw nsw i64 %23, 24 + %25 = or i64 %20, %24 + %26 = getelementptr inbounds i8, i8* %2, i64 5 + %27 = load i8, i8* %26, align 1 + %28 = zext i8 %27 to i64 + %29 = shl nuw nsw i64 %28, 16 + %30 = or i64 %25, %29 + %31 = getelementptr inbounds i8, i8* %2, i64 6 + %32 = load i8, i8* %31, align 1 + %33 = zext i8 %32 to i64 + %34 = shl nuw nsw i64 %33, 8 + %35 = or i64 %30, %34 + %36 = getelementptr inbounds i8, i8* %2, i64 7 + %37 = load i8, i8* %36, align 1 + %38 = zext i8 %37 to i64 + %39 = or i64 %35, %38 + ret i64 %39 +} + +; Part of the load by bytes pattern is used outside of the pattern +; i8* p; +; i32 x = (i32) p[1] +; res = ((i32) p[0] << 24) | (x << 16) | ((i32) p[2] << 8) | (i32) p[3] +; x | res +define i32 @load_i32_by_i8_bswap_uses(i32*) { +; CHECK-LABEL: load_i32_by_i8_bswap_uses: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .Ltmp6: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .Ltmp7: +; CHECK-NEXT: .cfi_offset %esi, -8 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl (%eax), %ecx +; CHECK-NEXT: shll $24, %ecx +; CHECK-NEXT: movzbl 1(%eax), %edx +; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: shll $16, %esi +; CHECK-NEXT: orl %ecx, %esi +; CHECK-NEXT: movzbl 2(%eax), %ecx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: orl %esi, %ecx +; CHECK-NEXT: movzbl 3(%eax), %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: popl %esi +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i8_bswap_uses: +; CHECK64: # BB#0: +; CHECK64-NEXT: movzbl (%rdi), %eax +; CHECK64-NEXT: shll $24, %eax +; CHECK64-NEXT: movzbl 1(%rdi), %ecx +; CHECK64-NEXT: movl %ecx, %edx +; CHECK64-NEXT: shll $16, %edx +; CHECK64-NEXT: orl %eax, %edx +; CHECK64-NEXT: movzbl 2(%rdi), %esi +; CHECK64-NEXT: shll $8, %esi +; CHECK64-NEXT: orl %edx, %esi +; CHECK64-NEXT: movzbl 3(%rdi), %eax +; CHECK64-NEXT: orl %esi, %eax +; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: retq + + %2 = bitcast i32* %0 to i8* + %3 = load i8, i8* %2, align 1 + %4 = zext i8 %3 to i32 + %5 = shl nuw nsw i32 %4, 24 + %6 = getelementptr inbounds i8, i8* %2, i32 1 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i32 + %9 = shl nuw nsw i32 
%8, 16 + %10 = or i32 %9, %5 + %11 = getelementptr inbounds i8, i8* %2, i32 2 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i32 + %14 = shl nuw nsw i32 %13, 8 + %15 = or i32 %10, %14 + %16 = getelementptr inbounds i8, i8* %2, i32 3 + %17 = load i8, i8* %16, align 1 + %18 = zext i8 %17 to i32 + %19 = or i32 %15, %18 + ; Use individual part of the pattern outside of the pattern + %20 = or i32 %8, %19 + ret i32 %20 +} + +; One of the loads is volatile +; i8* p; +; p0 = volatile *p; +; ((i32) p0 << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] +define i32 @load_i32_by_i8_bswap_volatile(i32*) { +; CHECK-LABEL: load_i32_by_i8_bswap_volatile: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl (%eax), %ecx +; CHECK-NEXT: shll $24, %ecx +; CHECK-NEXT: movzbl 1(%eax), %edx +; CHECK-NEXT: shll $16, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: movzbl 2(%eax), %ecx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: movzbl 3(%eax), %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i8_bswap_volatile: +; CHECK64: # BB#0: +; CHECK64-NEXT: movzbl (%rdi), %eax +; CHECK64-NEXT: shll $24, %eax +; CHECK64-NEXT: movzbl 1(%rdi), %ecx +; CHECK64-NEXT: shll $16, %ecx +; CHECK64-NEXT: orl %eax, %ecx +; CHECK64-NEXT: movzbl 2(%rdi), %edx +; CHECK64-NEXT: shll $8, %edx +; CHECK64-NEXT: orl %ecx, %edx +; CHECK64-NEXT: movzbl 3(%rdi), %eax +; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: retq + + %2 = bitcast i32* %0 to i8* + %3 = load volatile i8, i8* %2, align 1 + %4 = zext i8 %3 to i32 + %5 = shl nuw nsw i32 %4, 24 + %6 = getelementptr inbounds i8, i8* %2, i32 1 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i32 + %9 = shl nuw nsw i32 %8, 16 + %10 = or i32 %9, %5 + %11 = getelementptr inbounds i8, i8* %2, i32 2 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i32 + %14 = shl nuw nsw i32 %13, 8 + %15 = or i32 %10, %14 + %16 = getelementptr inbounds i8, i8* %2, i32 3 + %17 = load i8, i8* %16, align 1 + %18 = zext i8 %17 to i32 + %19 = or i32 %15, %18 + ret i32 %19 +} + +; There is a store in between individual loads +; i8* p, q; +; res1 = ((i32) p[0] << 24) | ((i32) p[1] << 16) +; *q = 0; +; res2 = ((i32) p[2] << 8) | (i32) p[3] +; res1 | res2 +define i32 @load_i32_by_i8_bswap_store_in_between(i32*, i32*) { +; CHECK-LABEL: load_i32_by_i8_bswap_store_in_between: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .Ltmp8: +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .Ltmp9: +; CHECK-NEXT: .cfi_offset %esi, -8 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movzbl (%ecx), %edx +; CHECK-NEXT: shll $24, %edx +; CHECK-NEXT: movzbl 1(%ecx), %esi +; CHECK-NEXT: movl $0, (%eax) +; CHECK-NEXT: shll $16, %esi +; CHECK-NEXT: orl %edx, %esi +; CHECK-NEXT: movzbl 2(%ecx), %edx +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: orl %esi, %edx +; CHECK-NEXT: movzbl 3(%ecx), %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: popl %esi +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i8_bswap_store_in_between: +; CHECK64: # BB#0: +; CHECK64-NEXT: movzbl (%rdi), %eax +; CHECK64-NEXT: shll $24, %eax +; CHECK64-NEXT: movzbl 1(%rdi), %ecx +; CHECK64-NEXT: movl $0, (%rsi) +; CHECK64-NEXT: shll $16, %ecx +; CHECK64-NEXT: orl %eax, %ecx +; CHECK64-NEXT: movzbl 2(%rdi), %edx +; CHECK64-NEXT: shll $8, %edx +; CHECK64-NEXT: orl %ecx, %edx +; CHECK64-NEXT: movzbl 3(%rdi), %eax +; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: retq + + %3 = bitcast i32* 
%0 to i8* + %4 = load i8, i8* %3, align 1 + %5 = zext i8 %4 to i32 + %6 = shl nuw nsw i32 %5, 24 + %7 = getelementptr inbounds i8, i8* %3, i32 1 + %8 = load i8, i8* %7, align 1 + ; This store will prevent folding of the pattern + store i32 0, i32* %1 + %9 = zext i8 %8 to i32 + %10 = shl nuw nsw i32 %9, 16 + %11 = or i32 %10, %6 + %12 = getelementptr inbounds i8, i8* %3, i32 2 + %13 = load i8, i8* %12, align 1 + %14 = zext i8 %13 to i32 + %15 = shl nuw nsw i32 %14, 8 + %16 = or i32 %11, %15 + %17 = getelementptr inbounds i8, i8* %3, i32 3 + %18 = load i8, i8* %17, align 1 + %19 = zext i8 %18 to i32 + %20 = or i32 %16, %19 + ret i32 %20 +} + +; One of the loads is from an unrelated location +; i8* p, q; +; ((i32) p[0] << 24) | ((i32) q[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] +define i32 @load_i32_by_i8_bswap_unrelated_load(i32*, i32*) { +; CHECK-LABEL: load_i32_by_i8_bswap_unrelated_load: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movzbl (%ecx), %edx +; CHECK-NEXT: shll $24, %edx +; CHECK-NEXT: movzbl 1(%eax), %eax +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: movzbl 2(%ecx), %edx +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: orl %eax, %edx +; CHECK-NEXT: movzbl 3(%ecx), %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i8_bswap_unrelated_load: +; CHECK64: # BB#0: +; CHECK64-NEXT: movzbl (%rdi), %eax +; CHECK64-NEXT: shll $24, %eax +; CHECK64-NEXT: movzbl 1(%rsi), %ecx +; CHECK64-NEXT: shll $16, %ecx +; CHECK64-NEXT: orl %eax, %ecx +; CHECK64-NEXT: movzbl 2(%rdi), %edx +; CHECK64-NEXT: shll $8, %edx +; CHECK64-NEXT: orl %ecx, %edx +; CHECK64-NEXT: movzbl 3(%rdi), %eax +; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: retq + + %3 = bitcast i32* %0 to i8* + %4 = bitcast i32* %1 to i8* + %5 = load i8, i8* %3, align 1 + %6 = zext i8 %5 to i32 + %7 = shl nuw nsw i32 %6, 24 + ; Load from an unrelated address + %8 = getelementptr inbounds i8, i8* %4, i32 1 + %9 = load i8, i8* %8, align 1 + %10 = zext i8 %9 to i32 + %11 = shl nuw nsw i32 %10, 16 + %12 = or i32 %11, %7 + %13 = getelementptr inbounds i8, i8* %3, i32 2 + %14 = load i8, i8* %13, align 1 + %15 = zext i8 %14 to i32 + %16 = shl nuw nsw i32 %15, 8 + %17 = or i32 %12, %16 + %18 = getelementptr inbounds i8, i8* %3, i32 3 + %19 = load i8, i8* %18, align 1 + %20 = zext i8 %19 to i32 + %21 = or i32 %17, %20 + ret i32 %21 +} + +; Non-zero offsets are not supported for now +; i8* p; +; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) +define i32 @load_i32_by_i8_unsupported_offset(i32*) { +; CHECK-LABEL: load_i32_by_i8_unsupported_offset: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl 1(%eax), %ecx +; CHECK-NEXT: movzbl 2(%eax), %edx +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: movzbl 3(%eax), %ecx +; CHECK-NEXT: shll $16, %ecx +; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: movzbl 4(%eax), %eax +; CHECK-NEXT: shll $24, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i8_unsupported_offset: +; CHECK64: # BB#0: +; CHECK64-NEXT: movzbl 1(%rdi), %eax +; CHECK64-NEXT: movzbl 2(%rdi), %ecx +; CHECK64-NEXT: shll $8, %ecx +; CHECK64-NEXT: orl %eax, %ecx +; CHECK64-NEXT: movzbl 3(%rdi), %edx +; CHECK64-NEXT: shll $16, %edx +; CHECK64-NEXT: orl %ecx, %edx +; CHECK64-NEXT: movzbl 4(%rdi), %eax +; CHECK64-NEXT: shll $24, %eax +; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: retq + + %2 = bitcast
i32* %0 to i8* + %3 = getelementptr inbounds i8, i8* %2, i32 1 + %4 = load i8, i8* %3, align 1 + %5 = zext i8 %4 to i32 + %6 = getelementptr inbounds i8, i8* %2, i32 2 + %7 = load i8, i8* %6, align 1 + %8 = zext i8 %7 to i32 + %9 = shl nuw nsw i32 %8, 8 + %10 = or i32 %9, %5 + %11 = getelementptr inbounds i8, i8* %2, i32 3 + %12 = load i8, i8* %11, align 1 + %13 = zext i8 %12 to i32 + %14 = shl nuw nsw i32 %13, 16 + %15 = or i32 %10, %14 + %16 = getelementptr inbounds i8, i8* %2, i32 4 + %17 = load i8, i8* %16, align 1 + %18 = zext i8 %17 to i32 + %19 = shl nuw nsw i32 %18, 24 + %20 = or i32 %15, %19 + ret i32 %20 +} + +; i8* p; i32 i; +; ((i32) p[i] << 24) | ((i32) p[i + 1] << 16) | ((i32) p[i + 2] << 8) | (i32) p[i + 3] +define i32 @load_i32_by_i8_bswap_base_index_offset(i32*, i32) { +; CHECK-LABEL: load_i32_by_i8_bswap_base_index_offset: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl (%ecx,%eax), %eax +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: retl +; +; Currently we don't fold the pattern for the x86-64 target because we don't +; see that the loads are adjacent. It happens because BaseIndexOffset doesn't +; look through zexts. +; +; CHECK64-LABEL: load_i32_by_i8_bswap_base_index_offset: +; CHECK64: # BB#0: +; CHECK64-NEXT: movslq %esi, %rax +; CHECK64-NEXT: movzbl (%rdi,%rax), %ecx +; CHECK64-NEXT: shll $24, %ecx +; CHECK64-NEXT: movzbl 1(%rdi,%rax), %edx +; CHECK64-NEXT: shll $16, %edx +; CHECK64-NEXT: orl %ecx, %edx +; CHECK64-NEXT: movzbl 2(%rdi,%rax), %ecx +; CHECK64-NEXT: shll $8, %ecx +; CHECK64-NEXT: orl %edx, %ecx +; CHECK64-NEXT: movzbl 3(%rdi,%rax), %eax +; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: retq + + %3 = bitcast i32* %0 to i8* + %4 = getelementptr inbounds i8, i8* %3, i32 %1 + %5 = load i8, i8* %4, align 1 + %6 = zext i8 %5 to i32 + %7 = shl nuw nsw i32 %6, 24 + %8 = add nuw nsw i32 %1, 1 + %9 = getelementptr inbounds i8, i8* %3, i32 %8 + %10 = load i8, i8* %9, align 1 + %11 = zext i8 %10 to i32 + %12 = shl nuw nsw i32 %11, 16 + %13 = or i32 %12, %7 + %14 = add nuw nsw i32 %1, 2 + %15 = getelementptr inbounds i8, i8* %3, i32 %14 + %16 = load i8, i8* %15, align 1 + %17 = zext i8 %16 to i32 + %18 = shl nuw nsw i32 %17, 8 + %19 = or i32 %13, %18 + %20 = add nuw nsw i32 %1, 3 + %21 = getelementptr inbounds i8, i8* %3, i32 %20 + %22 = load i8, i8* %21, align 1 + %23 = zext i8 %22 to i32 + %24 = or i32 %19, %23 + ret i32 %24 +}
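+ +; A sketch of the kind of C source these patterns come from, assuming a +; little endian target (the function name is illustrative only): +; +; uint32_t load_be32(const uint8_t *p) { +; return ((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16) | +; ((uint32_t) p[2] << 8) | (uint32_t) p[3]; +; } +; +; With this combine the shift/or sequence folds into a single i32 load plus a +; BSWAP when ISD::BSWAP is legal for the type.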