Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -18,6 +18,7 @@
 #define LLVM_CODEGEN_GLOBALISEL_COMBINER_HELPER_H
 
 #include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/LowLevelType.h"
 #include "llvm/CodeGen/Register.h"
 #include "llvm/Support/Alignment.h"
@@ -471,6 +472,20 @@
   bool applyCombineInsertVecElts(MachineInstr &MI,
                                  SmallVectorImpl<std::pair<Register, Register>> &MatchInfo);
 
+  /// Match expression trees of the form
+  ///
+  /// \code
+  ///  sN *a = ...
+  ///  sM val = a[0] | (a[1] << N) | (a[2] << 2N) | (a[3] << 3N) ...
+  /// \endcode
+  ///
+  /// And check if the tree can be replaced with an M-bit load + possibly a
+  /// bswap.
+  bool matchLoadOrCombine(MachineInstr &MI,
+                          std::function<void(MachineIRBuilder &)> &MatchInfo);
+  bool applyLoadOrCombine(MachineInstr &MI,
+                          std::function<void(MachineIRBuilder &)> &MatchInfo);
+
   /// Try to transform \p MI by using all of the above
   /// combine functions. Returns true if changed.
   bool tryCombine(MachineInstr &MI);
@@ -499,6 +514,30 @@
   /// \returns true if a candidate is found.
   bool findPreIndexCandidate(MachineInstr &MI, Register &Addr, Register &Base,
                              Register &Offset);
+
+  /// Helper function for matchLoadOrCombine. Searches for Registers
+  /// which may have been produced by a load instruction + some arithmetic.
+  ///
+  /// \param [in] Root - The search root.
+  ///
+  /// \returns The Registers found during the search.
+  Optional<SmallVector<Register, 8>>
+  findCandidatesForLoadOrCombine(const MachineInstr *Root) const;
+
+  /// Helper function for matchLoadOrCombine.
+  ///
+  /// Checks if every register in \p RegsToVisit is defined by a load
+  /// instruction + some arithmetic.
+  ///
+  /// \param [out] MemOffset2Idx - Maps the byte positions each load ends up
+  /// at to the index of the load.
+  /// \param [in] MemSizeInBits - The number of bits each load should produce.
+  ///
+  /// \returns On success, the load which uses the lowest index, and that
+  /// lowest index.
+  Optional<std::pair<MachineInstr *, int64_t>> findLoadOffsetsForLoadOrCombine(
+      SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
+      const SmallVector<Register, 8> &RegsToVisit,
+      const unsigned MemSizeInBits);
 };
 } // namespace llvm
Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1658,6 +1658,11 @@
                           const MachineMemOperand &MMO,
                           bool *Fast = nullptr) const;
 
+  /// LLT handling variant.
+  bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, LLT Ty,
+                          const MachineMemOperand &MMO,
+                          bool *Fast = nullptr) const;
+
   /// Returns the target specific optimal type for load and store operations as
   /// a result of memset, memcpy, and memmove lowering.
 /// It returns EVT::Other if the type should be determined using generic
Index: llvm/include/llvm/Target/GlobalISel/Combine.td
===================================================================
--- llvm/include/llvm/Target/GlobalISel/Combine.td
+++ llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -545,6 +545,14 @@
     [{ return Helper.matchCombineInsertVecElts(*${root}, ${info}); }]),
   (apply [{ return Helper.applyCombineInsertVecElts(*${root}, ${info}); }])>;
 
+def load_or_combine_matchdata :
+GIDefMatchData<"std::function<void(MachineIRBuilder &)>">;
+def load_or_combine : GICombineRule<
+  (defs root:$root, load_or_combine_matchdata:$info),
+  (match (wip_match_opcode G_OR):$root,
+    [{ return Helper.matchLoadOrCombine(*${root}, ${info}); }]),
+  (apply [{ return Helper.applyLoadOrCombine(*${root}, ${info}); }])>;
+
 // Currently only the one combine above.
 def insert_vec_elt_combines : GICombineGroup<
                             [combine_insert_vec_elts_build_vector]>;
@@ -587,4 +595,4 @@
     unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc,
     unmerge_zext_to_zext, trunc_ext_fold, trunc_shl, const_combines,
     xor_of_and_with_same_reg, ptr_add_with_zero,
-    shift_immed_chain, shift_of_shifted_logic_chain]>;
+    shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine]>;
Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -48,6 +48,66 @@
   return *Builder.getMF().getSubtarget().getTargetLowering();
 }
 
+/// \returns The little endian in-memory byte position of byte \p I in a
+/// \p ByteWidth bytes wide type.
+///
+/// E.g. Given a 4-byte type x, x[0] -> byte 0
+static unsigned littleEndianByteAt(const unsigned ByteWidth, const unsigned I) {
+  assert(I < ByteWidth && "I must be in [0, ByteWidth)");
+  return I;
+}
+
+/// \returns The big endian in-memory byte position of byte \p I in a
+/// \p ByteWidth bytes wide type.
+///
+/// E.g. Given a 4-byte type x, x[0] -> byte 3
+static unsigned bigEndianByteAt(const unsigned ByteWidth, const unsigned I) {
+  assert(I < ByteWidth && "I must be in [0, ByteWidth)");
+  return ByteWidth - I - 1;
+}
+
+/// Given a map from byte offsets in memory to indices in a load/store,
+/// determine if that map corresponds to a little or big endian byte pattern.
+///
+/// \param MemOffset2Idx maps memory offsets to address offsets.
+/// \param LowestIdx is the lowest index in \p MemOffset2Idx.
+///
+/// \returns true if the map corresponds to a big endian byte pattern, false
+/// if it corresponds to a little endian byte pattern, and None otherwise.
+///
+/// E.g. given a 32-bit type x, and x[AddrOffset], the in-memory byte patterns
+/// are as follows:
+///
+/// AddrOffset  Little endian    Big endian
+/// 0           0                3
+/// 1           1                2
+/// 2           2                1
+/// 3           3                0
+static Optional<bool>
+isBigEndian(const SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
+            int64_t LowestIdx) {
+  // Need at least two byte positions to decide on endianness.
+  unsigned Width = MemOffset2Idx.size();
+  if (Width < 2)
+    return None;
+  bool BigEndian = true, LittleEndian = true;
+  for (unsigned MemOffset = 0; MemOffset < Width; ++MemOffset) {
+    auto MemOffsetAndIdx = MemOffset2Idx.find(MemOffset);
+    if (MemOffsetAndIdx == MemOffset2Idx.end())
+      return None;
+    const int64_t Idx = MemOffsetAndIdx->second - LowestIdx;
+    assert(Idx >= 0 && "Expected non-negative byte offset?");
+    LittleEndian &= Idx == littleEndianByteAt(Width, MemOffset);
+    BigEndian &= Idx == bigEndianByteAt(Width, MemOffset);
+    if (!BigEndian && !LittleEndian)
+      return None;
+  }
+
+  assert((BigEndian != LittleEndian) &&
+         "Pattern cannot be both big and little endian!");
+  return BigEndian;
+}
+
 bool CombinerHelper::isLegalOrBeforeLegalizer(
     const LegalityQuery &Query) const {
   return !LI || LI->getAction(Query).Action == LegalizeActions::Legal;
@@ -564,13 +624,16 @@
   assert(DefMI.getParent() == UseMI.getParent());
   if (&DefMI == &UseMI)
     return false;
-
-  // Loop through the basic block until we find one of the instructions.
-  MachineBasicBlock::const_iterator I = DefMI.getParent()->begin();
-  for (; &*I != &DefMI && &*I != &UseMI; ++I)
-    return &*I == &DefMI;
-
-  llvm_unreachable("Block must contain instructions");
+  const MachineBasicBlock &MBB = *DefMI.getParent();
+  auto NonDbgInsts =
+      instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
+  auto DefOrUse =
+      find_if(NonDbgInsts, [&DefMI, &UseMI](const MachineInstr &MI) {
+        return &MI == &DefMI || &MI == &UseMI;
+      });
+  if (DefOrUse == NonDbgInsts.end())
+    llvm_unreachable("Block must contain both DefMI and UseMI!");
+  return &*DefOrUse == &DefMI;
 }
 
 bool CombinerHelper::dominates(const MachineInstr &DefMI,
@@ -3152,6 +3215,361 @@
   return true;
 }
 
+Optional<SmallVector<Register, 8>>
+CombinerHelper::findCandidatesForLoadOrCombine(const MachineInstr *Root) const {
+  assert(Root->getOpcode() == TargetOpcode::G_OR && "Expected G_OR only!");
+  // We want to detect if Root is part of a tree which represents a bunch
+  // of loads being merged into a larger load. We'll try to recognize patterns
+  // like, for example:
+  //
+  //  Reg   Reg
+  //   \    /
+  //    OR_1   Reg
+  //     \    /
+  //      OR_2
+  //        \    Reg
+  //         .. /
+  //         Root
+  //
+  //  Reg   Reg   Reg   Reg
+  //     \ /       \   /
+  //     OR_1      OR_2
+  //       \       /
+  //        \     /
+  //         ...
+  //         Root
+  //
+  // Each "Reg" may have been produced by a load + some arithmetic. This
+  // function will save each of them.
+  SmallVector<Register, 8> RegsToVisit;
+  SmallVector<const MachineInstr *> Ors = {Root};
+
+  // In the "worst" case, we're dealing with a load for each byte. So, there
+  // are at most #bytes - 1 ORs.
+  const unsigned MaxIter =
+      MRI.getType(Root->getOperand(0).getReg()).getSizeInBytes() - 1;
+  for (unsigned Iter = 0; Iter < MaxIter; ++Iter) {
+    if (Ors.empty())
+      break;
+    const MachineInstr *Curr = Ors.pop_back_val();
+    Register OrLHS = Curr->getOperand(1).getReg();
+    Register OrRHS = Curr->getOperand(2).getReg();
+
+    // In the combine, we want to eliminate the entire tree.
+    if (!MRI.hasOneNonDBGUse(OrLHS) || !MRI.hasOneNonDBGUse(OrRHS))
+      return None;
+
+    // If it's a G_OR, save it and continue to walk. If it's not, then it's
+    // something that may be a load + arithmetic.
+    if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrLHS, MRI))
+      Ors.push_back(Or);
+    else
+      RegsToVisit.push_back(OrLHS);
+    if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrRHS, MRI))
+      Ors.push_back(Or);
+    else
+      RegsToVisit.push_back(OrRHS);
+  }
+
+  // We're going to try and merge each register into a wider power-of-2 type,
+  // so we ought to have an even number of registers.
+  if (RegsToVisit.empty() || RegsToVisit.size() % 2 != 0)
+    return None;
+  return RegsToVisit;
+}
+
+/// Helper function for findLoadOffsetsForLoadOrCombine.
+///
+/// Check if \p Reg is the result of loading a \p MemSizeInBits wide value,
+/// and then moving that value into a specific byte offset.
+///
+/// e.g. x[i] << 24
+///
+/// \returns The load instruction and the byte offset it is moved into.
+static Optional<std::pair<MachineInstr *, int64_t>>
+matchLoadAndBytePosition(Register Reg, unsigned MemSizeInBits,
+                         const MachineRegisterInfo &MRI) {
+  assert(MRI.hasOneNonDBGUse(Reg) &&
+         "Expected Reg to only have one non-debug use?");
+  Register MaybeLoad;
+  int64_t Shift;
+  if (!mi_match(Reg, MRI,
+                m_OneNonDBGUse(m_GShl(m_Reg(MaybeLoad), m_ICst(Shift))))) {
+    Shift = 0;
+    MaybeLoad = Reg;
+  }
+
+  if (Shift % MemSizeInBits != 0)
+    return None;
+
+  // TODO: Handle other types of loads.
+  auto *Load = getOpcodeDef(TargetOpcode::G_ZEXTLOAD, MaybeLoad, MRI);
+  if (!Load)
+    return None;
+
+  const auto &MMO = **Load->memoperands_begin();
+  if (!MMO.isUnordered() || MMO.getSizeInBits() != MemSizeInBits)
+    return None;
+
+  return std::make_pair(Load, Shift / MemSizeInBits);
+}
+
+Optional<std::pair<MachineInstr *, int64_t>>
+CombinerHelper::findLoadOffsetsForLoadOrCombine(
+    SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
+    const SmallVector<Register, 8> &RegsToVisit, const unsigned MemSizeInBits) {
+
+  // Each load found for the pattern. There should be one for each RegsToVisit.
+  SmallSetVector<const MachineInstr *, 8> Loads;
+
+  // The lowest index used in any load. (The lowest "i" for each x[i].)
+  int64_t LowestIdx = INT64_MAX;
+
+  // The load which uses the lowest index.
+  MachineInstr *LowestIdxLoad = nullptr;
+
+  // Keeps track of the load indices we see. We shouldn't see any indices twice.
+  SmallSet<int64_t, 8> SeenIdx;
+
+  // Ensure each load is in the same MBB.
+  // TODO: Support multiple MachineBasicBlocks.
+  MachineBasicBlock *MBB = nullptr;
+  const MachineMemOperand *MMO = nullptr;
+
+  // Earliest instruction-order load in the pattern.
+  MachineInstr *EarliestLoad = nullptr;
+
+  // Latest instruction-order load in the pattern.
+  MachineInstr *LatestLoad = nullptr;
+
+  // Base pointer which every load should share.
+  Register BasePtr;
+
+  // We want to find a load for each register. Each load should have some
+  // appropriate bit twiddling arithmetic. During this loop, we will also keep
+  // track of the load which uses the lowest index. Later, we will check if we
+  // can use its pointer in the final, combined load.
+  for (auto Reg : RegsToVisit) {
+    // Find the load, and find the byte position that its value ends up at
+    // (i.e. how far it is shifted).
+    auto LoadAndPos = matchLoadAndBytePosition(Reg, MemSizeInBits, MRI);
+    if (!LoadAndPos)
+      return None;
+    MachineInstr *Load;
+    int64_t DstPos;
+    std::tie(Load, DstPos) = *LoadAndPos;
+
+    // TODO: Handle multiple MachineBasicBlocks. Currently not handled because
+    // it is difficult to check for stores/calls/etc between loads.
+    MachineBasicBlock *LoadMBB = Load->getParent();
+    if (!MBB)
+      MBB = LoadMBB;
+    if (LoadMBB != MBB)
+      return None;
+
+    // Make sure that the MachineMemOperands of every seen load are compatible.
+    const MachineMemOperand *LoadMMO = *Load->memoperands_begin();
+    if (!MMO)
+      MMO = LoadMMO;
+    if (MMO->getAddrSpace() != LoadMMO->getAddrSpace())
+      return None;
+
+    // Find out what the base pointer and index for the load is.
+    Register LoadPtr;
+    int64_t Idx;
+    if (!mi_match(Load->getOperand(1).getReg(), MRI,
+                  m_GPtrAdd(m_Reg(LoadPtr), m_ICst(Idx)))) {
+      LoadPtr = Load->getOperand(1).getReg();
+      Idx = 0;
+    }
+
+    // Don't combine things like a[i], a[i] -> a bigger load.
+    if (!SeenIdx.insert(Idx).second)
+      return None;
+
+    // Every load must share the same base pointer; don't combine things like:
+    //
+    // a[i], b[i + 1] -> a bigger load.
+    if (!BasePtr.isValid())
+      BasePtr = LoadPtr;
+    if (BasePtr != LoadPtr)
+      return None;
+
+    if (Idx < LowestIdx) {
+      LowestIdx = Idx;
+      LowestIdxLoad = Load;
+    }
+
+    // Keep track of the byte offset that this load ends up at. If we have seen
+    // the byte offset, then stop here. We do not want to combine:
+    //
+    // a[i] << 16, a[i + k] << 16 -> a bigger load.
+    if (!MemOffset2Idx.try_emplace(DstPos, Idx).second)
+      return None;
+    Loads.insert(Load);
+
+    // Keep track of the position of the earliest/latest loads in the pattern.
+    // We will check that there are no load fold barriers between them later
+    // on.
+    //
+    // FIXME: Is there a better way to check for load fold barriers?
+    if (!EarliestLoad || dominates(*Load, *EarliestLoad))
+      EarliestLoad = Load;
+    if (!LatestLoad || dominates(*LatestLoad, *Load))
+      LatestLoad = Load;
+  }
+
+  // We found a load for each register. Let's check if each load satisfies the
+  // pattern.
+  assert(Loads.size() == RegsToVisit.size() &&
+         "Expected to find a load for each register?");
+  assert(EarliestLoad != LatestLoad && EarliestLoad &&
+         LatestLoad && "Expected at least two loads?");
+
+  // Check if there are any stores, calls, etc. between any of the loads. If
+  // there are, then we can't safely perform the combine.
+  //
+  // MaxIter is chosen based off the (worst case) number of iterations it
+  // typically takes to succeed in the LLVM test suite plus some padding.
+  //
+  // FIXME: Is there a better way to check for load fold barriers?
+  const unsigned MaxIter = 20;
+  unsigned Iter = 0;
+  for (const auto &MI : instructionsWithoutDebug(EarliestLoad->getIterator(),
+                                                 LatestLoad->getIterator())) {
+    if (Loads.count(&MI))
+      continue;
+    if (MI.isLoadFoldBarrier())
+      return None;
+    if (Iter++ == MaxIter)
+      return None;
+  }
+
+  return std::make_pair(LowestIdxLoad, LowestIdx);
+}
+
+bool CombinerHelper::matchLoadOrCombine(
+    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_OR);
+  MachineFunction &MF = *MI.getMF();
+  // Assuming a little-endian target, transform:
+  //  s8 *a = ...
+  //  s32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
+  // =>
+  //  s32 val = *((s32)a)
+  //
+  //  s8 *a = ...
+  //  s32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
+  // =>
+  //  s32 val = BSWAP(*((s32)a))
+  Register Dst = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(Dst);
+  if (Ty.isVector())
+    return false;
+
+  // We need to combine at least two loads into this type. Since the smallest
+  // possible load is into a byte, we need at least a 16-bit wide type.
+  const unsigned WideMemSizeInBits = Ty.getSizeInBits();
+  if (WideMemSizeInBits < 16 || WideMemSizeInBits % 8 != 0)
+    return false;
+
+  // Match a collection of non-OR instructions in the pattern.
+  auto RegsToVisit = findCandidatesForLoadOrCombine(&MI);
+  if (!RegsToVisit)
+    return false;
+
+  // We have a collection of non-OR instructions. Figure out how wide each of
+  // the small loads should be based on the number of potential loads we
+  // found.
+  const unsigned NarrowMemSizeInBits = WideMemSizeInBits / RegsToVisit->size();
+  if (NarrowMemSizeInBits % 8 != 0)
+    return false;
+
+  // Check if each register feeding into each OR is a load from the same
+  // base pointer + some arithmetic.
+  //
+  // e.g. a[0], a[1] << 8, a[2] << 16, etc.
+  //
+  // Also verify that each of these ends up putting a[i] into the same memory
+  // offset as a load into a wide type would.
+  SmallDenseMap<int64_t, int64_t, 8> MemOffset2Idx;
+  MachineInstr *LowestIdxLoad;
+  int64_t LowestIdx;
+  auto MaybeLoadInfo = findLoadOffsetsForLoadOrCombine(
+      MemOffset2Idx, *RegsToVisit, NarrowMemSizeInBits);
+  if (!MaybeLoadInfo)
+    return false;
+  std::tie(LowestIdxLoad, LowestIdx) = *MaybeLoadInfo;
+
+  // We have a bunch of loads being OR'd together. Using the addresses + offsets
+  // we found before, check if this corresponds to a big or little endian byte
+  // pattern. If it does, then we can represent it using a load + possibly a
+  // BSWAP.
+  bool IsBigEndianTarget = MF.getDataLayout().isBigEndian();
+  Optional<bool> IsBigEndian = isBigEndian(MemOffset2Idx, LowestIdx);
+  if (!IsBigEndian.hasValue())
+    return false;
+  bool NeedsBSwap = IsBigEndianTarget != *IsBigEndian;
+  if (NeedsBSwap && !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {Ty}}))
+    return false;
+
+  // Make sure that the load from the lowest index produces offset 0 in the
+  // final value.
+  //
+  // This ensures that we won't combine something like this:
+  //
+  // load x[i]   -> byte 2
+  // load x[i+1] -> byte 0 ---> wide_load x[i]
+  // load x[i+2] -> byte 1
+  const unsigned NumLoadsInTy = WideMemSizeInBits / NarrowMemSizeInBits;
+  const unsigned ZeroByteOffset =
+      *IsBigEndian
+          ? bigEndianByteAt(NumLoadsInTy, 0)
+          : littleEndianByteAt(NumLoadsInTy, 0);
+  auto ZeroOffsetIdx = MemOffset2Idx.find(ZeroByteOffset);
+  if (ZeroOffsetIdx == MemOffset2Idx.end() ||
+      ZeroOffsetIdx->second != LowestIdx)
+    return false;
+
+  // We will reuse the pointer from the load which ends up at byte offset 0. It
+  // may not use index 0.
+  Register Ptr = LowestIdxLoad->getOperand(1).getReg();
+  const MachineMemOperand &MMO = **LowestIdxLoad->memoperands_begin();
+  LegalityQuery::MemDesc MMDesc;
+  MMDesc.SizeInBits = WideMemSizeInBits;
+  MMDesc.AlignInBits = MMO.getAlign().value() * 8;
+  MMDesc.Ordering = MMO.getOrdering();
+  if (!isLegalOrBeforeLegalizer(
+          {TargetOpcode::G_LOAD, {Ty, MRI.getType(Ptr)}, {MMDesc}}))
+    return false;
+  auto PtrInfo = MMO.getPointerInfo();
+  auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, WideMemSizeInBits / 8);
+
+  // Load must be allowed and fast on the target.
+  LLVMContext &C = MF.getFunction().getContext();
+  auto &DL = MF.getDataLayout();
+  bool Fast = false;
+  if (!getTargetLowering().allowsMemoryAccess(C, DL, Ty, *NewMMO, &Fast) ||
+      !Fast)
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &MIB) {
+    Register LoadDst = NeedsBSwap ?
MRI.cloneVirtualRegister(Dst) : Dst; + MIB.buildLoad(LoadDst, Ptr, *NewMMO); + if (NeedsBSwap) + MIB.buildBSwap(Dst, LoadDst); + }; + return true; +} + +bool CombinerHelper::applyLoadOrCombine( + MachineInstr &MI, std::function &MatchInfo) { + Builder.setInstrAndDebugLoc(MI); + MatchInfo(Builder); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; Index: llvm/lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- llvm/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1756,6 +1756,14 @@ MMO.getFlags(), Fast); } +bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context, + const DataLayout &DL, LLT Ty, + const MachineMemOperand &MMO, + bool *Fast) const { + return allowsMemoryAccess(Context, DL, getMVTForLLT(Ty), MMO.getAddrSpace(), + MMO.getAlign(), MMO.getFlags(), Fast); +} + BranchProbability TargetLoweringBase::getPredictableBranchThreshold() const { return BranchProbability(MinPercentageForPredictableBranch, 100); } Index: llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern-align.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern-align.mir @@ -0,0 +1,79 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -debugify-and-strip-all-safe -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=NOT_STRICT +# RUN: llc -debugify-and-strip-all-safe -mattr=+strict-align -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=STRICT + +# REQUIRES: asserts + +# Check that the load-or combine respects alignment requirements. +... +--- +name: misaligned +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; NOT_STRICT-LABEL: name: misaligned + ; NOT_STRICT: liveins: $x0, $x1 + ; NOT_STRICT: %ptr:_(p0) = COPY $x1 + ; NOT_STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2) + ; NOT_STRICT: $w1 = COPY %full_load(s32) + ; NOT_STRICT: RET_ReallyLR implicit $w1 + ; STRICT-LABEL: name: misaligned + ; STRICT: liveins: $x0, $x1 + ; STRICT: %cst_1:_(s64) = G_CONSTANT i64 1 + ; STRICT: %cst_16:_(s32) = G_CONSTANT i32 16 + ; STRICT: %ptr:_(p0) = COPY $x1 + ; STRICT: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; STRICT: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; STRICT: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; STRICT: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; STRICT: %full_load:_(s32) = G_OR %low_half, %high_half + ; STRICT: $w1 = COPY %full_load(s32) + ; STRICT: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, align 2) + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, align 2) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... 
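For reference, a minimal standalone sketch of the byte-pattern classification performed by isBigEndian() in CombinerHelper.cpp above. classifyBigEndian is an illustrative stand-in (not an LLVM API), and the map plays the role of MemOffset2Idx, mapping a byte position in the combined value to the index of the narrow load that supplies it.

```cpp
// Standalone illustration, using only the standard library so it can be
// compiled and run in isolation.
#include <cstdint>
#include <iostream>
#include <map>
#include <optional>

// Key: byte position in the combined value. Value: index i of the narrow
// load x[i] that ends up at that position.
static std::optional<bool>
classifyBigEndian(const std::map<unsigned, int64_t> &MemOffset2Idx,
                  int64_t LowestIdx) {
  const unsigned Width = MemOffset2Idx.size();
  if (Width < 2)
    return std::nullopt; // Need at least two positions to decide.
  bool Big = true, Little = true;
  for (unsigned Off = 0; Off < Width; ++Off) {
    auto It = MemOffset2Idx.find(Off);
    if (It == MemOffset2Idx.end())
      return std::nullopt;
    const int64_t Idx = It->second - LowestIdx;
    Little &= Idx == int64_t(Off);            // x[Idx] lands at byte Idx.
    Big &= Idx == int64_t(Width - Off - 1);   // x[Idx] lands at byte Width-1-Idx.
    if (!Big && !Little)
      return std::nullopt;
  }
  return Big;
}

int main() {
  // a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24): byte k holds a[k].
  std::map<unsigned, int64_t> LE = {{0, 0}, {1, 1}, {2, 2}, {3, 3}};
  // (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]: byte k holds a[3 - k].
  std::map<unsigned, int64_t> BE = {{0, 3}, {1, 2}, {2, 1}, {3, 0}};
  std::cout << *classifyBigEndian(LE, 0) << ' '
            << *classifyBigEndian(BE, 0) << '\n'; // prints "0 1"
}
```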
+--- +name: aligned +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; NOT_STRICT-LABEL: name: aligned + ; NOT_STRICT: liveins: $x0, $x1 + ; NOT_STRICT: %ptr:_(p0) = COPY $x1 + ; NOT_STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4) + ; NOT_STRICT: $w1 = COPY %full_load(s32) + ; NOT_STRICT: RET_ReallyLR implicit $w1 + ; STRICT-LABEL: name: aligned + ; STRICT: liveins: $x0, $x1 + ; STRICT: %ptr:_(p0) = COPY $x1 + ; STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4) + ; STRICT: $w1 = COPY %full_load(s32) + ; STRICT: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, align 4) + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, align 4) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 Index: llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern.mir @@ -0,0 +1,1571 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -debugify-and-strip-all-safe -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=LITTLE +# RUN: llc -debugify-and-strip-all-safe -mtriple arm64eb -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=BIG + +# REQUIRES: asserts + +# Test that we can combine patterns like +# +# s8* x = ... +# s32 y = (x[0] | (x[1] << 8)) | ((x[2] << 16) | (x[3] << 24)) +# +# Into either a load, or a load with a bswap. + +... +--- +name: s8_loads_to_s32_little_endian_pat +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; s8* x = ... 
+ ; s32 y = (x[0] | (x[1] << 8)) | ((x[2] << 16) | (x[3] << 24)) + ; + ; -> Little endian: Load from x[0] + ; -> Big endian: Load from x[0] + BSWAP + + ; LITTLE-LABEL: name: s8_loads_to_s32_little_endian_pat + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1) + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: s8_loads_to_s32_little_endian_pat + ; BIG: liveins: $x0, $x1 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1) + ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]] + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s32) = G_CONSTANT i32 1 + %cst_2:_(s32) = G_CONSTANT i32 2 + %cst_3:_(s32) = G_CONSTANT i32 3 + + %cst_8:_(s32) = G_CONSTANT i32 8 + %cst_16:_(s32) = G_CONSTANT i32 16 + %cst_24:_(s32) = G_CONSTANT i32 24 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32) + %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32) + + %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1) + %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1) + + %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32) + %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32) + %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32) + + ; Note the shape of the tree: + ; + ; byte byte byte byte + ; \ / \ / + ; OR OR + ; \ / + ; \ / + ; OR + + %or1:_(s32) = G_OR %byte0, %byte1 + %or2:_(s32) = G_OR %byte2, %byte3 + %full_load:_(s32) = G_OR %or1, %or2 + + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: s8_loads_to_s32_big_endian_pat +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; s8* x = ... + ; s32 y = (x[0] << 24 | (x[1] << 16)) | ((x[2] << 8) | x[3])) + ; + ; -> Little endian: Load from x[0] + BSWAP + ; -> Big endian: Load from x[0] + + ; LITTLE-LABEL: name: s8_loads_to_s32_big_endian_pat + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1) + ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]] + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: s8_loads_to_s32_big_endian_pat + ; BIG: liveins: $x0, $x1 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1) + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s32) = G_CONSTANT i32 1 + %cst_2:_(s32) = G_CONSTANT i32 2 + %cst_3:_(s32) = G_CONSTANT i32 3 + + %cst_8:_(s32) = G_CONSTANT i32 8 + %cst_16:_(s32) = G_CONSTANT i32 16 + %cst_24:_(s32) = G_CONSTANT i32 24 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32) + %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32) + + %elt0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1) + + %byte0:_(s32) = nuw G_SHL %elt0, %cst_24(s32) + %byte1:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + %byte2:_(s32) = nuw G_SHL %elt2, %cst_8(s32) + %byte3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1) + + %or1:_(s32) = G_OR %byte0, %byte1 + %or2:_(s32) = G_OR %byte2, %byte3 + %full_load:_(s32) = G_OR %or1, %or2 + + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... 
+--- +name: different_or_pattern +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; Slightly different OR tree. + ; + ; s8* x = ... + ; s32 y = (((x[0] | (x[1] << 8)) | (x[2] << 16)) | (x[3] << 24)) + ; + ; -> Little endian: Load from x[0] + ; -> Big endian: Load from x[0] + BSWAP + + ; LITTLE-LABEL: name: different_or_pattern + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1) + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: different_or_pattern + ; BIG: liveins: $x0, $x1 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1) + ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]] + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s32) = G_CONSTANT i32 1 + %cst_2:_(s32) = G_CONSTANT i32 2 + %cst_3:_(s32) = G_CONSTANT i32 3 + + %cst_8:_(s32) = G_CONSTANT i32 8 + %cst_16:_(s32) = G_CONSTANT i32 16 + %cst_24:_(s32) = G_CONSTANT i32 24 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32) + %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32) + + %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1) + %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1) + + %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32) + %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32) + %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32) + + ; Note the shape of the tree: + ; + ; byte byte + ; \ / + ; OR_1 byte + ; \ / + ; OR_2 + ; \ + ; ... + + %or1:_(s32) = G_OR %byte0, %byte1 + %or2:_(s32) = G_OR %or1, %byte2 + %full_load:_(s32) = G_OR %or2, %byte3 + + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: s16_loads_to_s32_little_endian_pat +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; s16* x = ... + ; s32 y = x[0] | (x[1] << 16) + ; + ; -> Little endian: Load from x[0] + ; -> Big endian: Load from x[0] + BSWAP + + ; LITTLE-LABEL: name: s16_loads_to_s32_little_endian_pat + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2) + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: s16_loads_to_s32_little_endian_pat + ; BIG: liveins: $x0, $x1 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2) + ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]] + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: s16_loads_to_s32_big_endian_pat +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; s16 *x = ... 
+ ; s32 y = x[1] | (x[0] << 16) + ; + ; -> Little endian: Load from x[0] + BSWAP + ; -> Big endian: Load from x[0] + + ; LITTLE-LABEL: name: s16_loads_to_s32_big_endian_pat + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2) + ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]] + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: s16_loads_to_s32_big_endian_pat + ; BIG: liveins: $x0, $x1 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2) + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %elt0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + %high_half:_(s32) = nuw G_SHL %elt0, %cst_16(s32) + %low_half:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: s16_loads_to_s64_little_endian_pat +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; s16 *x = ... + ; s32 y = (x[0] | (x[1] << 16)) | ((x[2] << 32) | (x[3] << 48)) + ; + ; -> Little endian: Load from x[0] + ; -> Big endian: Load from x[0] + BSWAP + + ; LITTLE-LABEL: name: s16_loads_to_s64_little_endian_pat + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %full_load:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2) + ; LITTLE: $x1 = COPY %full_load(s64) + ; LITTLE: RET_ReallyLR implicit $x1 + ; BIG-LABEL: name: s16_loads_to_s64_little_endian_pat + ; BIG: liveins: $x0, $x1 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2) + ; BIG: %full_load:_(s64) = G_BSWAP [[LOAD]] + ; BIG: $x1 = COPY %full_load(s64) + ; BIG: RET_ReallyLR implicit $x1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_2:_(s64) = G_CONSTANT i64 2 + %cst_3:_(s64) = G_CONSTANT i64 3 + + %cst_16:_(s64) = G_CONSTANT i64 16 + %cst_32:_(s64) = G_CONSTANT i64 32 + %cst_48:_(s64) = G_CONSTANT i64 48 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s64) + %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64) + + %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2) + + %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %elt2:_(s64) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 2) + %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2) + + %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64) + %byte4_byte5:_(s64) = nuw G_SHL %elt2, %cst_32(s64) + %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64) + + %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3 + %or2:_(s64) = G_OR %byte4_byte5, %byte6_byte7 + %full_load:_(s64) = G_OR %or1, %or2 + + $x1 = COPY %full_load(s64) + RET_ReallyLR implicit $x1 + +... +--- +name: s16_loads_to_s64_big_endian_pat +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; s16 *x = ... 
+ ; s64 y = (x[3] | (x[2] << 16)) | ((x[1] << 32) | (x[0] << 48)) + ; + ; -> Little endian: Load from x[0] + BSWAP + ; -> Big endian: Load from x[0] + + ; LITTLE-LABEL: name: s16_loads_to_s64_big_endian_pat + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2) + ; LITTLE: %full_load:_(s64) = G_BSWAP [[LOAD]] + ; LITTLE: $x1 = COPY %full_load(s64) + ; LITTLE: RET_ReallyLR implicit $x1 + ; BIG-LABEL: name: s16_loads_to_s64_big_endian_pat + ; BIG: liveins: $x0, $x1 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %full_load:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2) + ; BIG: $x1 = COPY %full_load(s64) + ; BIG: RET_ReallyLR implicit $x1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_2:_(s64) = G_CONSTANT i64 2 + %cst_3:_(s64) = G_CONSTANT i64 3 + + %cst_16:_(s64) = G_CONSTANT i64 16 + %cst_32:_(s64) = G_CONSTANT i64 32 + %cst_48:_(s64) = G_CONSTANT i64 48 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s64) + %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64) + + %elt0:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2) + %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %elt2:_(s64) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 2) + + %byte0_byte1:_(s64) = nuw G_SHL %elt0, %cst_48(s64) + %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_32(s64) + %byte4_byte5:_(s64) = nuw G_SHL %elt2, %cst_16(s64) + %byte6_byte7:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2) + + %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3 + %or2:_(s64) = G_OR %byte4_byte5, %byte6_byte7 + %full_load:_(s64) = G_OR %or1, %or2 + + $x1 = COPY %full_load(s64) + RET_ReallyLR implicit $x1 + + +... +--- +name: nonzero_start_idx_positive_little_endian_pat +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; s8* x = ... 
+ ; s32 y = (x[1] | (x[2] << 8)) | ((x[3] << 16) | (x[4] << 24)) + ; + ; -> Little endian: Load from x[1] + ; -> Big endian: Load from x[1] + BSWAP + + ; LITTLE-LABEL: name: nonzero_start_idx_positive_little_endian_pat + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1 + ; LITTLE: %ptr:_(p0) = COPY $x0 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + ; LITTLE: %full_load:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1) + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: nonzero_start_idx_positive_little_endian_pat + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1 + ; BIG: %ptr:_(p0) = COPY $x0 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1) + ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]] + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s32) = G_CONSTANT i32 1 + %cst_2:_(s32) = G_CONSTANT i32 2 + %cst_3:_(s32) = G_CONSTANT i32 3 + %cst_4:_(s32) = G_CONSTANT i32 4 + + %cst_8:_(s32) = G_CONSTANT i32 8 + %cst_16:_(s32) = G_CONSTANT i32 16 + %cst_24:_(s32) = G_CONSTANT i32 24 + + %ptr:_(p0) = COPY $x0 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32) + %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32) + %ptr_elt_4:_(p0) = G_PTR_ADD %ptr, %cst_4(s32) + + %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1) + %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1) + %elt4:_(s32) = G_ZEXTLOAD %ptr_elt_4(p0) :: (load 1) + + %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + %byte1:_(s32) = nuw G_SHL %elt2, %cst_8(s32) + %byte2:_(s32) = nuw G_SHL %elt3, %cst_16(s32) + %byte3:_(s32) = nuw G_SHL %elt4, %cst_24(s32) + + %or1:_(s32) = G_OR %byte0, %byte1 + %or2:_(s32) = G_OR %byte2, %byte3 + %full_load:_(s32) = G_OR %or1, %or2 + + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: nonzero_start_idx_positive_big_endian_pat +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; s8* x = ... 
+ ; s32 y = (x[4] | (x[3] << 8)) | ((x[2] << 16) | (x[1] << 24)) + ; + ; -> Little endian: Load from x[1] + BSWAP + ; -> Big endian: Load from x[1] + + ; LITTLE-LABEL: name: nonzero_start_idx_positive_big_endian_pat + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1 + ; LITTLE: %ptr:_(p0) = COPY $x0 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1) + ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]] + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: nonzero_start_idx_positive_big_endian_pat + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1 + ; BIG: %ptr:_(p0) = COPY $x0 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + ; BIG: %full_load:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1) + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s32) = G_CONSTANT i32 1 + %cst_2:_(s32) = G_CONSTANT i32 2 + %cst_3:_(s32) = G_CONSTANT i32 3 + %cst_4:_(s32) = G_CONSTANT i32 4 + + %cst_8:_(s32) = G_CONSTANT i32 8 + %cst_16:_(s32) = G_CONSTANT i32 16 + %cst_24:_(s32) = G_CONSTANT i32 24 + + %ptr:_(p0) = COPY $x0 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32) + %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32) + %ptr_elt_4:_(p0) = G_PTR_ADD %ptr, %cst_4(s32) + + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1) + %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1) + + %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_4(p0) :: (load 1) + %byte1:_(s32) = nuw G_SHL %elt3, %cst_8(s32) + %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32) + %byte3:_(s32) = nuw G_SHL %elt1, %cst_24(s32) + + %or1:_(s32) = G_OR %byte0, %byte1 + %or2:_(s32) = G_OR %byte2, %byte3 + %full_load:_(s32) = G_OR %or1, %or2 + + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: nonzero_start_idx_negative_little_endian_pat +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; s8* x = ... 
+ ; s32 y = (x[-3] | (x[-2] << 8)) | ((x[-1] << 16) | (x[0] << 24)) + ; + ; -> Little endian: Load from x[-3] + ; -> Big endian: Load from x[-3] + BSWAP + + ; LITTLE-LABEL: name: nonzero_start_idx_negative_little_endian_pat + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_neg_3:_(s32) = G_CONSTANT i32 -3 + ; LITTLE: %ptr:_(p0) = COPY $x0 + ; LITTLE: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32) + ; LITTLE: %full_load:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1) + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: nonzero_start_idx_negative_little_endian_pat + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_neg_3:_(s32) = G_CONSTANT i32 -3 + ; BIG: %ptr:_(p0) = COPY $x0 + ; BIG: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32) + ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1) + ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]] + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_neg_1:_(s32) = G_CONSTANT i32 -1 + %cst_neg_2:_(s32) = G_CONSTANT i32 -2 + %cst_neg_3:_(s32) = G_CONSTANT i32 -3 + + %cst_8:_(s32) = G_CONSTANT i32 8 + %cst_16:_(s32) = G_CONSTANT i32 16 + %cst_24:_(s32) = G_CONSTANT i32 24 + + %ptr:_(p0) = COPY $x0 + %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32) + %ptr_elt_neg_2:_(p0) = G_PTR_ADD %ptr, %cst_neg_2(s32) + %ptr_elt_neg_1:_(p0) = G_PTR_ADD %ptr, %cst_neg_1(s32) + + %elt_neg_2:_(s32) = G_ZEXTLOAD %ptr_elt_neg_2(p0) :: (load 1) + %elt_neg_1:_(s32) = G_ZEXTLOAD %ptr_elt_neg_1(p0) :: (load 1) + %elt_0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + + %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_neg_3(p0) :: (load 1) + %byte1:_(s32) = nuw G_SHL %elt_neg_2, %cst_8(s32) + %byte2:_(s32) = nuw G_SHL %elt_neg_1, %cst_16(s32) + %byte3:_(s32) = nuw G_SHL %elt_0, %cst_24(s32) + + %or1:_(s32) = G_OR %byte0, %byte1 + %or2:_(s32) = G_OR %byte2, %byte3 + %full_load:_(s32) = G_OR %or1, %or2 + + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: nonzero_start_idx_negative_big_endian_pat +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; s8* x = ... 
+ ; s32 y = (x[0] | (x[-1] << 8)) | ((x[-2] << 16) | (x[-3] << 24)) + ; + ; -> Little endian: Load from x[-3] + BSWAP + ; -> Big endian: Load from x[-3] + + ; LITTLE-LABEL: name: nonzero_start_idx_negative_big_endian_pat + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_neg_3:_(s32) = G_CONSTANT i32 -3 + ; LITTLE: %ptr:_(p0) = COPY $x0 + ; LITTLE: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32) + ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1) + ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]] + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: nonzero_start_idx_negative_big_endian_pat + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_neg_3:_(s32) = G_CONSTANT i32 -3 + ; BIG: %ptr:_(p0) = COPY $x0 + ; BIG: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32) + ; BIG: %full_load:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1) + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_neg_1:_(s32) = G_CONSTANT i32 -1 + %cst_neg_2:_(s32) = G_CONSTANT i32 -2 + %cst_neg_3:_(s32) = G_CONSTANT i32 -3 + + %cst_8:_(s32) = G_CONSTANT i32 8 + %cst_16:_(s32) = G_CONSTANT i32 16 + %cst_24:_(s32) = G_CONSTANT i32 24 + + %ptr:_(p0) = COPY $x0 + %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32) + %ptr_elt_neg_2:_(p0) = G_PTR_ADD %ptr, %cst_neg_2(s32) + %ptr_elt_neg_1:_(p0) = G_PTR_ADD %ptr, %cst_neg_1(s32) + + %elt_neg_3:_(s32) = G_ZEXTLOAD %ptr_elt_neg_3(p0) :: (load 1) + %elt_neg_2:_(s32) = G_ZEXTLOAD %ptr_elt_neg_2(p0) :: (load 1) + %elt_neg_1:_(s32) = G_ZEXTLOAD %ptr_elt_neg_1(p0) :: (load 1) + %elt_0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + + %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + %byte1:_(s32) = nuw G_SHL %elt_neg_1, %cst_8(s32) + %byte2:_(s32) = nuw G_SHL %elt_neg_2, %cst_16(s32) + %byte3:_(s32) = nuw G_SHL %elt_neg_3, %cst_24(s32) + + %or1:_(s32) = G_OR %byte0, %byte1 + %or2:_(s32) = G_OR %byte2, %byte3 + %full_load:_(s32) = G_OR %or1, %or2 + + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: dont_combine_volatile +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; Combine should only happen with unordered loads. 
+ + ; LITTLE-LABEL: name: dont_combine_volatile + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1 + ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (volatile load 2) + ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_combine_volatile + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1 + ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (volatile load 2) + ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (volatile load 2) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: dont_wrong_memop_size +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; Combine should only happen when the loads load the same size. + + ; LITTLE-LABEL: name: dont_wrong_memop_size + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1 + ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; LITTLE: %wrong_size_load:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + ; LITTLE: %high_half:_(s32) = nuw G_SHL %wrong_size_load, %cst_16(s32) + ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_wrong_memop_size + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1 + ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; BIG: %wrong_size_load:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + ; BIG: %high_half:_(s32) = nuw G_SHL %wrong_size_load, %cst_16(s32) + ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + %wrong_size_load:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + %high_half:_(s32) = nuw G_SHL %wrong_size_load, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... 
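For context, a minimal source-level sketch of patterns the combine must reject, mirroring the negative tests in this file (dont_combine_volatile above and dont_combine_wrong_offset below). The function names are illustrative only, not from the patch.

```cpp
#include <cstdint>

// dont_combine_volatile: ordered/volatile accesses may not be merged into a
// single wider load.
uint32_t read_volatile_halves(const volatile uint16_t *p) {
  return uint32_t(p[0]) | (uint32_t(p[1]) << 16);
}

// dont_combine_wrong_offset: the loaded chunks must tile the destination
// exactly. Here p[1] is shifted by 24 rather than 16, so bits 16-23 are never
// written and the high byte of p[1] is discarded; this is not equivalent to a
// plain 32-bit load.
uint32_t read_misplaced_half(const uint16_t *p) {
  return uint32_t(p[0]) | (uint32_t(p[1]) << 24);
}
```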
+--- +name: dont_combine_wrong_offset +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; This is not equivalent to a 32-bit load with/without a BSWAP: + ; + ; s16 *x = ... + ; s32 y = x[0] | (x[1] << 24) + + ; LITTLE-LABEL: name: dont_combine_wrong_offset + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1 + ; LITTLE: %cst_24:_(s32) = G_CONSTANT i32 24 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_24(s32) + ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_combine_wrong_offset + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1 + ; BIG: %cst_24:_(s32) = G_CONSTANT i32 24 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_24(s32) + ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_24:_(s32) = G_CONSTANT i32 24 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_24(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: dont_combine_wrong_offset_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; This does not correspond to a 32-bit load with/without a BSWAP: + ; + ; s16 *x = ... 
+ ; s32 y = x[0] | (x[1] << 8) + + ; LITTLE-LABEL: name: dont_combine_wrong_offset_2 + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1 + ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_8(s32) + ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_combine_wrong_offset_2 + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1 + ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_8(s32) + ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_8:_(s32) = G_CONSTANT i32 8 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_8(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: dont_combine_missing_load +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; This is missing x[2], so we shouldn't combine: + ; + ; s16 *x = ... + ; s64 y = (x[0] | (x[1] << 16)) | (x[3] << 48) + + ; LITTLE-LABEL: name: dont_combine_missing_load + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1 + ; LITTLE: %cst_3:_(s64) = G_CONSTANT i64 3 + ; LITTLE: %cst_16:_(s64) = G_CONSTANT i64 16 + ; LITTLE: %cst_48:_(s64) = G_CONSTANT i64 48 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; LITTLE: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64) + ; LITTLE: %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; LITTLE: %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; LITTLE: %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2) + ; LITTLE: %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64) + ; LITTLE: %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64) + ; LITTLE: %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3 + ; LITTLE: %full_load:_(s64) = G_OR %or1, %byte6_byte7 + ; LITTLE: $x1 = COPY %full_load(s64) + ; LITTLE: RET_ReallyLR implicit $x1 + ; BIG-LABEL: name: dont_combine_missing_load + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1 + ; BIG: %cst_3:_(s64) = G_CONSTANT i64 3 + ; BIG: %cst_16:_(s64) = G_CONSTANT i64 16 + ; BIG: %cst_48:_(s64) = G_CONSTANT i64 48 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; BIG: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64) + ; BIG: %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; BIG: %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; BIG: %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2) + ; BIG: %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64) + ; BIG: %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64) + ; BIG: %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3 + ; BIG: %full_load:_(s64) = G_OR %or1, 
%byte6_byte7 + ; BIG: $x1 = COPY %full_load(s64) + ; BIG: RET_ReallyLR implicit $x1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_3:_(s64) = G_CONSTANT i64 3 + + %cst_16:_(s64) = G_CONSTANT i64 16 + %cst_48:_(s64) = G_CONSTANT i64 48 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64) + + %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2) + + %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2) + + %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64) + %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64) + + %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3 + %full_load:_(s64) = G_OR %or1, %byte6_byte7 + + $x1 = COPY %full_load(s64) + RET_ReallyLR implicit $x1 + +... +--- +name: dont_combine_different_addr_spaces +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; When the loads are from different address spaces, don't combine. + + ; LITTLE-LABEL: name: dont_combine_different_addr_spaces + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1 + ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, addrspace 1) + ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_combine_different_addr_spaces + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1 + ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, addrspace 1) + ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, addrspace 0) + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, addrspace 1) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: dont_combine_duplicate_idx +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; If two of the G_PTR_ADDs have the same index, then don't combine. + ; + ; sN *x = ... + ; sM y = (x[i] << A) | (x[i] << B) ... 
+ + ; LITTLE-LABEL: name: dont_combine_duplicate_idx + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1 + ; LITTLE: %reused_idx:_(s32) = G_CONSTANT i32 2 + ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8 + ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16 + ; LITTLE: %cst_24:_(s32) = G_CONSTANT i32 24 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + ; LITTLE: %uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32) + ; LITTLE: %also_uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32) + ; LITTLE: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + ; LITTLE: %elt2:_(s32) = G_ZEXTLOAD %uses_idx_2(p0) :: (load 1) + ; LITTLE: %elt3:_(s32) = G_ZEXTLOAD %also_uses_idx_2(p0) :: (load 1) + ; LITTLE: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32) + ; LITTLE: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32) + ; LITTLE: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32) + ; LITTLE: %or1:_(s32) = G_OR %byte0, %byte1 + ; LITTLE: %or2:_(s32) = G_OR %byte2, %byte3 + ; LITTLE: %full_load:_(s32) = G_OR %or1, %or2 + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_combine_duplicate_idx + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1 + ; BIG: %reused_idx:_(s32) = G_CONSTANT i32 2 + ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8 + ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16 + ; BIG: %cst_24:_(s32) = G_CONSTANT i32 24 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + ; BIG: %uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32) + ; BIG: %also_uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32) + ; BIG: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + ; BIG: %elt2:_(s32) = G_ZEXTLOAD %uses_idx_2(p0) :: (load 1) + ; BIG: %elt3:_(s32) = G_ZEXTLOAD %also_uses_idx_2(p0) :: (load 1) + ; BIG: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32) + ; BIG: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32) + ; BIG: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32) + ; BIG: %or1:_(s32) = G_OR %byte0, %byte1 + ; BIG: %or2:_(s32) = G_OR %byte2, %byte3 + ; BIG: %full_load:_(s32) = G_OR %or1, %or2 + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s32) = G_CONSTANT i32 1 + %reused_idx:_(s32) = G_CONSTANT i32 2 + + %cst_8:_(s32) = G_CONSTANT i32 8 + %cst_16:_(s32) = G_CONSTANT i32 16 + %cst_24:_(s32) = G_CONSTANT i32 24 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + %uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32) + %also_uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32) + + %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + %elt2:_(s32) = G_ZEXTLOAD %uses_idx_2(p0) :: (load 1) + %elt3:_(s32) = G_ZEXTLOAD %also_uses_idx_2(p0) :: (load 1) + + %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32) + %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32) + %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32) + + %or1:_(s32) = G_OR %byte0, %byte1 + %or2:_(s32) = G_OR %byte2, %byte3 + %full_load:_(s32) = G_OR %or1, %or2 + + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 +... +--- +name: dont_combine_duplicate_offset +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; If two of the G_SHLs have the same constant, then we should not combine. + ; + ; sN *x = ... + ; sM y = (x[i] << A) | (x[i+1] << A) ... 
+ + ; LITTLE-LABEL: name: dont_combine_duplicate_offset + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1 + ; LITTLE: %cst_2:_(s32) = G_CONSTANT i32 2 + ; LITTLE: %cst_3:_(s32) = G_CONSTANT i32 3 + ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8 + ; LITTLE: %duplicate_shl_cst:_(s32) = G_CONSTANT i32 16 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + ; LITTLE: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32) + ; LITTLE: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32) + ; LITTLE: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + ; LITTLE: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1) + ; LITTLE: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1) + ; LITTLE: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32) + ; LITTLE: %duplicate_shl_1:_(s32) = nuw G_SHL %elt2, %duplicate_shl_cst(s32) + ; LITTLE: %duplicate_shl_2:_(s32) = nuw G_SHL %elt3, %duplicate_shl_cst(s32) + ; LITTLE: %or1:_(s32) = G_OR %byte0, %byte1 + ; LITTLE: %or2:_(s32) = G_OR %duplicate_shl_1, %duplicate_shl_2 + ; LITTLE: %full_load:_(s32) = G_OR %or1, %or2 + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_combine_duplicate_offset + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1 + ; BIG: %cst_2:_(s32) = G_CONSTANT i32 2 + ; BIG: %cst_3:_(s32) = G_CONSTANT i32 3 + ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8 + ; BIG: %duplicate_shl_cst:_(s32) = G_CONSTANT i32 16 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + ; BIG: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32) + ; BIG: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32) + ; BIG: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + ; BIG: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1) + ; BIG: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1) + ; BIG: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32) + ; BIG: %duplicate_shl_1:_(s32) = nuw G_SHL %elt2, %duplicate_shl_cst(s32) + ; BIG: %duplicate_shl_2:_(s32) = nuw G_SHL %elt3, %duplicate_shl_cst(s32) + ; BIG: %or1:_(s32) = G_OR %byte0, %byte1 + ; BIG: %or2:_(s32) = G_OR %duplicate_shl_1, %duplicate_shl_2 + ; BIG: %full_load:_(s32) = G_OR %or1, %or2 + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s32) = G_CONSTANT i32 1 + %cst_2:_(s32) = G_CONSTANT i32 2 + %cst_3:_(s32) = G_CONSTANT i32 3 + + %cst_8:_(s32) = G_CONSTANT i32 8 + %duplicate_shl_cst:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32) + %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32) + + %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1) + %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1) + + %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32) + %duplicate_shl_1:_(s32) = nuw G_SHL %elt2, %duplicate_shl_cst(s32) + %duplicate_shl_2:_(s32) = nuw G_SHL %elt3, %duplicate_shl_cst(s32) + + %or1:_(s32) = G_OR %byte0, %byte1 + %or2:_(s32) = G_OR %duplicate_shl_1, %duplicate_shl_2 + %full_load:_(s32) = G_OR %or1, %or2 + + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: dont_combine_lowest_index_not_zero_offset +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; In this case, the lowest index load (e.g. 
x[0]) does not end up at byte + ; offset 0. We shouldn't combine. + ; + ; s8 *x = ... + ; s32 y = (x[0] << 8) | (x[1]) | (x[2] << 16) ... + + ; LITTLE-LABEL: name: dont_combine_lowest_index_not_zero_offset + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1 + ; LITTLE: %cst_2:_(s32) = G_CONSTANT i32 2 + ; LITTLE: %cst_3:_(s32) = G_CONSTANT i32 3 + ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8 + ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16 + ; LITTLE: %cst_24:_(s32) = G_CONSTANT i32 24 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + ; LITTLE: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32) + ; LITTLE: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32) + ; LITTLE: %lowest_idx_load:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + ; LITTLE: %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + ; LITTLE: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1) + ; LITTLE: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1) + ; LITTLE: %byte1:_(s32) = nuw G_SHL %lowest_idx_load, %cst_8(s32) + ; LITTLE: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32) + ; LITTLE: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32) + ; LITTLE: %or1:_(s32) = G_OR %byte0, %byte1 + ; LITTLE: %or2:_(s32) = G_OR %byte2, %byte3 + ; LITTLE: %full_load:_(s32) = G_OR %or1, %or2 + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_combine_lowest_index_not_zero_offset + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1 + ; BIG: %cst_2:_(s32) = G_CONSTANT i32 2 + ; BIG: %cst_3:_(s32) = G_CONSTANT i32 3 + ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8 + ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16 + ; BIG: %cst_24:_(s32) = G_CONSTANT i32 24 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + ; BIG: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32) + ; BIG: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32) + ; BIG: %lowest_idx_load:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + ; BIG: %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + ; BIG: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1) + ; BIG: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1) + ; BIG: %byte1:_(s32) = nuw G_SHL %lowest_idx_load, %cst_8(s32) + ; BIG: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32) + ; BIG: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32) + ; BIG: %or1:_(s32) = G_OR %byte0, %byte1 + ; BIG: %or2:_(s32) = G_OR %byte2, %byte3 + ; BIG: %full_load:_(s32) = G_OR %or1, %or2 + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s32) = G_CONSTANT i32 1 + %cst_2:_(s32) = G_CONSTANT i32 2 + %cst_3:_(s32) = G_CONSTANT i32 3 + + %cst_8:_(s32) = G_CONSTANT i32 8 + %cst_16:_(s32) = G_CONSTANT i32 16 + %cst_24:_(s32) = G_CONSTANT i32 24 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32) + %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32) + %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32) + + ; This load is index 0 + %lowest_idx_load:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1) + %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1) + %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1) + %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1) + + ; ... But it ends up being shifted, so we shouldn't combine. 
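+ ; The resulting byte pattern is neither a little- nor a big-endian layout of
+ ; x[0..3] (x[1] lands in byte 0, x[0] in byte 1), so no single wide load,
+ ; with or without a G_BSWAP, is equivalent.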
+ %byte1:_(s32) = nuw G_SHL %lowest_idx_load, %cst_8(s32) + %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32) + %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32) + + %or1:_(s32) = G_OR %byte0, %byte1 + %or2:_(s32) = G_OR %byte2, %byte3 + %full_load:_(s32) = G_OR %or1, %or2 + + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: dont_combine_more_than_one_use_load +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; If any load is used more than once, don't combine. We want to remove the + ; entire tree. + + ; LITTLE-LABEL: name: dont_combine_more_than_one_use_load + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1 + ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half + ; LITTLE: %extra_use:_(s32) = G_AND %full_load, %low_half + ; LITTLE: $w1 = COPY %extra_use(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_combine_more_than_one_use_load + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1 + ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half + ; BIG: %extra_use:_(s32) = G_AND %full_load, %low_half + ; BIG: $w1 = COPY %extra_use(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + %extra_use:_(s32) = G_AND %full_load, %low_half + $w1 = COPY %extra_use(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: dont_combine_more_than_one_use_shl +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + ; If anything feeding into any of the ors is used more than once, don't + ; combine. 
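+ ;
+ ; Here the G_SHL result %high_half feeds both %full_load and %extra_use, so
+ ; the tree could not be removed entirely after the combine.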
+ + ; LITTLE-LABEL: name: dont_combine_more_than_one_use_shl + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1 + ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half + ; LITTLE: %extra_use:_(s32) = G_AND %full_load, %high_half + ; LITTLE: $w1 = COPY %extra_use(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_combine_more_than_one_use_shl + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1 + ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half + ; BIG: %extra_use:_(s32) = G_AND %full_load, %high_half + ; BIG: $w1 = COPY %extra_use(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + %extra_use:_(s32) = G_AND %full_load, %high_half + $w1 = COPY %extra_use(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: dont_combine_store_between_same_mbb +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; If there is a store between any of the loads, then do not combine. 
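+ ; %other_ptr below is just another copy of $x1, so the intervening G_STORE
+ ; may overwrite bytes that %elt1 reads.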
+ + ; LITTLE-LABEL: name: dont_combine_store_between_same_mbb + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1 + ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; LITTLE: %other_ptr:_(p0) = COPY $x1 + ; LITTLE: %some_val:_(s32) = G_CONSTANT i32 12 + ; LITTLE: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2) + ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_combine_store_between_same_mbb + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1 + ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; BIG: %other_ptr:_(p0) = COPY $x1 + ; BIG: %some_val:_(s32) = G_CONSTANT i32 12 + ; BIG: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2) + ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + + ; Memory could be modified here, so don't combine! + %other_ptr:_(p0) = COPY $x1 + %some_val:_(s32) = G_CONSTANT i32 12 + G_STORE %some_val, %other_ptr :: (store 2) + + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: dont_combine_store_between_different_mbb +tracksRegLiveness: true +body: | + ; There is a store between the two loads, hidden away in a different MBB. + ; We should not combine here. 
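+ ; As above, the G_STORE in bb.1 may clobber the memory read by %elt1 in
+ ; bb.2, so the loads cannot be merged across it.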
+ + ; LITTLE-LABEL: name: dont_combine_store_between_different_mbb + ; LITTLE: bb.0: + ; LITTLE: successors: %bb.1(0x80000000) + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1 + ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; LITTLE: bb.1: + ; LITTLE: successors: %bb.2(0x80000000) + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %other_ptr:_(p0) = COPY $x1 + ; LITTLE: %some_val:_(s32) = G_CONSTANT i32 12 + ; LITTLE: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2) + ; LITTLE: bb.2: + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: dont_combine_store_between_different_mbb + ; BIG: bb.0: + ; BIG: successors: %bb.1(0x80000000) + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1 + ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; BIG: bb.1: + ; BIG: successors: %bb.2(0x80000000) + ; BIG: liveins: $x0, $x1 + ; BIG: %other_ptr:_(p0) = COPY $x1 + ; BIG: %some_val:_(s32) = G_CONSTANT i32 12 + ; BIG: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2) + ; BIG: bb.2: + ; BIG: liveins: $x0, $x1 + ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + + bb.0: + successors: %bb.1(0x80000000) + liveins: $x0, $x1 + ; If there is a store between any of the loads, then do not combine. + + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + + bb.1: + liveins: $x0, $x1 + successors: %bb.2(0x80000000) + ; Memory could be modified here, so don't combine! + %other_ptr:_(p0) = COPY $x1 + %some_val:_(s32) = G_CONSTANT i32 12 + G_STORE %some_val, %other_ptr :: (store 2) + + bb.2: + liveins: $x0, $x1 + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: different_mbb +tracksRegLiveness: true +body: | + ; It should be possible to combine here, but it's not supported right now. 
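+ ; This is the same pattern as above, but with no intervening store; the two
+ ; loads simply live in different basic blocks.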
+ + ; LITTLE-LABEL: name: different_mbb + ; LITTLE: bb.0: + ; LITTLE: successors: %bb.1(0x80000000) + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1 + ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; LITTLE: bb.1: + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: different_mbb + ; BIG: bb.0: + ; BIG: successors: %bb.1(0x80000000) + ; BIG: liveins: $x0, $x1 + ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1 + ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + ; BIG: bb.1: + ; BIG: liveins: $x0, $x1 + ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + + bb.0: + successors: %bb.1(0x80000000) + liveins: $x0, $x1 + + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + + bb.1: + liveins: $x0, $x1 + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1 + +... +--- +name: load_first +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; Test for a bug fix for predecessor-checking code. + + ; LITTLE-LABEL: name: load_first + ; LITTLE: liveins: $x0, $x1 + ; LITTLE: %ptr:_(p0) = COPY $x1 + ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2) + ; LITTLE: $w1 = COPY %full_load(s32) + ; LITTLE: RET_ReallyLR implicit $w1 + ; BIG-LABEL: name: load_first + ; BIG: liveins: $x0, $x1 + ; BIG: %ptr:_(p0) = COPY $x1 + ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2) + ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]] + ; BIG: $w1 = COPY %full_load(s32) + ; BIG: RET_ReallyLR implicit $w1 + %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2) + %cst_1:_(s64) = G_CONSTANT i64 1 + %cst_16:_(s32) = G_CONSTANT i32 16 + + %ptr:_(p0) = COPY $x1 + %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64) + + %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2) + %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32) + + %full_load:_(s32) = G_OR %low_half, %high_half + $w1 = COPY %full_load(s32) + RET_ReallyLR implicit $w1