Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1155,6 +1155,9 @@ /// \return The associativity of the cache level, if available. std::optional getCacheAssociativity(CacheLevel Level) const; + /// \return The minimum architectural page size for the target. + std::optional getMinPageSize() const; + /// \return How much before a load we should place the prefetch /// instruction. This is currently measured in number of /// instructions. @@ -1889,6 +1892,7 @@ virtual std::optional getCacheSize(CacheLevel Level) const = 0; virtual std::optional getCacheAssociativity(CacheLevel Level) const = 0; + virtual std::optional getMinPageSize() const = 0; /// \return How much before a load we should place the prefetch /// instruction. This is currently measured in number of @@ -2475,6 +2479,10 @@ return Impl.getCacheAssociativity(Level); } + std::optional getMinPageSize() const override { + return Impl.getMinPageSize(); + } + /// Return the preferred prefetch distance in terms of instructions. /// unsigned getPrefetchDistance() const override { Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -494,6 +494,8 @@ llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); } + std::optional getMinPageSize() const { return {}; } + unsigned getPrefetchDistance() const { return 0; } unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -753,6 +753,10 @@ return TTIImpl->getCacheAssociativity(Level); } +std::optional TargetTransformInfo::getMinPageSize() const { + return TTIImpl->getMinPageSize(); +} + unsigned TargetTransformInfo::getPrefetchDistance() const { return TTIImpl->getPrefetchDistance(); } Index: llvm/lib/Target/AArch64/AArch64.h =================================================================== --- llvm/lib/Target/AArch64/AArch64.h +++ llvm/lib/Target/AArch64/AArch64.h @@ -88,6 +88,7 @@ void initializeAArch64ExpandPseudoPass(PassRegistry &); void initializeAArch64GlobalsTaggingPass(PassRegistry &); void initializeAArch64LoadStoreOptPass(PassRegistry&); +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &); void initializeAArch64MIPeepholeOptPass(PassRegistry &); void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &); Index: llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h =================================================================== --- /dev/null +++ llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h @@ -0,0 +1,25 @@ +//===- AArch64LoopIdiomTransform.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +struct AArch64LoopIdiomTransformPass + : PassInfoMixin { + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H Index: llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp @@ -0,0 +1,726 @@ + +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-lit" + +static cl::opt + DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( + "disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), + cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl &ExitBlocks); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA, + GetElementPtrInst *GEPB, Value *Start, + Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, + Value *MaxLen, Value *Index, Value *Start, + bool IncIdx, BasicBlock *FoundBB, + BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { + initializeAArch64LoopIdiomTransformLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Recognize AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, + LPPassManager &LPM) { + + if (skipLoop(L)) + return false; + + auto *DT = &getAnalysis().getDomTree(); + auto *LI = &getAnalysis().getLoopInfo(); + auto &TTI = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char AArch64LoopIdiomTransformLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN( + AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", + "Transform specific loop idioms into optimised vector forms", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END( + AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", + "Transform specific loop idioms into optimised vector forms", false, false) + +Pass *llvm::createAArch64LoopIdiomTransformPass() { + return new AArch64LoopIdiomTransformLegacyPass(); +} + +PreservedAnalyses +AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + if (DisableAll) + return PreservedAnalyses::all(); + + const auto *DL = &L.getHeader()->getModule()->getDataLayout(); + + AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL); + if (!LIT.run(&L)) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} + +//===----------------------------------------------------------------------===// +// +// Implementation of AArch64LoopIdiomTransform +// +//===----------------------------------------------------------------------===// + +bool AArch64LoopIdiomTransform::run(Loop *L) { + CurLoop = L; + + if (DisableAll) + return false; + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) + return false; + + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" + << CurLoop->getHeader()->getParent()->getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + + return recognizeByteCompare(); +} + +/// Match loop-invariant value. +template struct match_LoopInvariant { + SubPattern_t SubPattern; + const Loop *L; + + match_LoopInvariant(const SubPattern_t &SP, const Loop *L) + : SubPattern(SP), L(L) {} + + template bool match(ITy *V) { + return L->isLoopInvariant(V) && SubPattern.match(V); + } +}; + +/// Matches if the value is loop-invariant. +template +inline match_LoopInvariant m_LoopInvariant(const Ty &M, const Loop *L) { + return match_LoopInvariant(M, L); +} + +bool AArch64LoopIdiomTransform::recognizeByteCompare() { + if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() || + DisableByteCmp) + return false; + + BasicBlock *Header = CurLoop->getHeader(); + BasicBlock *PH = CurLoop->getLoopPreheader(); + + // In AArch64LoopIdiomTransform::run we have already checked that the loop + // has a preheader so we can assume it's in a canonical form. + auto *EntryBI = cast(PH->getTerminator()); + + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2) + return false; + + PHINode *PN = dyn_cast(&Header->front()); + if (!PN || PN->getNumIncomingValues() != 2) + return false; + + auto LoopBlocks = CurLoop->getBlocks(); + // The first block in the loop should contain only 4 instructions, e.g. + // + // while.cond: + // %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ] + // %inc = add i32 %res.phi, 1 + // %cmp.not = icmp eq i32 %inc, %n + // br i1 %cmp.not, label %while.end, label %while.body + // + auto CondBBInsts = LoopBlocks[0]->instructionsWithoutDebug(); + if (std::distance(CondBBInsts.begin(), CondBBInsts.end()) > 4) + return false; + + // The second block should contain 7 instructions, e.g. + // + // while.body: + // %idx = zext i32 %inc to i64 + // %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx + // %load.a = load i8, ptr %idx.a + // %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx + // %load.b = load i8, ptr %idx.b + // %cmp.not.ld = icmp eq i8 %load.a, %load.b + // br i1 %cmp.not.ld, label %while.cond, label %while.end + // + auto LoopBBInsts = LoopBlocks[1]->instructionsWithoutDebug(); + if (std::distance(LoopBBInsts.begin(), LoopBBInsts.end()) > 7) + return false; + + using namespace PatternMatch; + + // The incoming value to the PHI node from the loop should be an add of 1. + Instruction *Index = nullptr; + Value *StartIdx = nullptr; + for (BasicBlock *BB : PN->blocks()) { + if (!CurLoop->contains(BB)) { + StartIdx = PN->getIncomingValueForBlock(BB); + continue; + } + Index = dyn_cast(PN->getIncomingValueForBlock(BB)); + // Limit to 32-bit types for now + if (!Index || !Index->getType()->isIntegerTy(32) || + !match(Index, m_c_Add(m_Specific(PN), m_One()))) + return false; + } + + // If we match the pattern, PN and Index will be replaced with the result of + // the cttz.elts intrinsic. If any other instructions are used outside of + // the loop, we cannot replace it. + for (BasicBlock *BB : LoopBlocks) + for (Instruction &I : *BB) + if (&I != PN && &I != Index) + for (User *U : I.users()) { + auto UI = cast(U); + if (!CurLoop->contains(UI)) + return false; + } + + // Don't replace the loop if the add has a wrap flag. + if (Index->hasNoSignedWrap() || Index->hasNoUnsignedWrap()) + return false; + + // Match the branch instruction for the header + ICmpInst::Predicate Pred; + Value *MaxLen; + BasicBlock *EndBB, *WhileBB; + if (!match(Header->getTerminator(), + m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)), + m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) || + Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(WhileBB)) + return false; + + // WhileBB should contain the pattern of load & compare instructions. Match + // the pattern and find the GEP instructions used by the loads. + ICmpInst::Predicate WhilePred; + BasicBlock *FoundBB; + BasicBlock *TrueBB; + Value *LoadA, *LoadB; + if (!match(WhileBB->getTerminator(), + m_Br(m_ICmp(WhilePred, m_Value(LoadA), m_Value(LoadB)), + m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) || + WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(TrueBB)) + return false; + + Value *A, *B; + if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B)))) + return false; + + GetElementPtrInst *GEPA = dyn_cast(A); + GetElementPtrInst *GEPB = dyn_cast(B); + + if (!GEPA || !GEPB) + return false; + + Value *PtrA = GEPA->getPointerOperand(); + Value *PtrB = GEPB->getPointerOperand(); + + // Check we are loading i8 values from two loop invariant pointers + if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) || + !GEPA->getResultElementType()->isIntegerTy(8) || + !GEPB->getResultElementType()->isIntegerTy(8) || + !cast(LoadA)->getType()->isIntegerTy(8) || + !cast(LoadB)->getType()->isIntegerTy(8) || PtrA == PtrB) + return false; + + // Check that the index to the GEPs is the index we found earlier + if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1) + return false; + + Value *IdxA = GEPA->getOperand(GEPA->getNumIndices()); + Value *IdxB = GEPB->getOperand(GEPB->getNumIndices()); + if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index)))) + return false; + + LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" + << *(EndBB->getParent()) << "\n\n"); + + // The index is incremented before the GEP/Load pair so we need to + // add 1 to the start value. + transformByteCompare(GEPA, GEPB, MaxLen, Index, StartIdx, /*IncIdx=*/true, FoundBB, + EndBB); + return true; +} + +Value *AArch64LoopIdiomTransform::expandFindMismatch(IRBuilder<> &Builder, + GetElementPtrInst *GEPA, + GetElementPtrInst *GEPB, + Value *Start, + Value *MaxLen) { + Value *PtrA = GEPA->getPointerOperand(); + Value *PtrB = GEPB->getPointerOperand(); + + // Get the arguments and types for the intrinsic. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + BranchInst *PHBranch = cast(Preheader->getTerminator()); + LLVMContext &Ctx = PHBranch->getContext(); + Type *LoadType = Type::getInt8Ty(Ctx); + Type *ResType = Builder.getInt32Ty(); + + // Split block in the original loop preheader. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + BasicBlock *EndBlock = + SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end"); + + // Create the blocks that we're going to need: + // 1. A block for checking the zero-extended length exceeds 0 + // 2. A block to check that the start and end addresses of a given array + // lie on the same page. + // 3. The SVE loop preheader. + // 4. The first SVE loop block. + // 5. The SVE loop increment block. + // 6. A block we can jump to from the SVE loop when a mismatch is found. + // 7. The first block of the scalar loop itself, containing PHIs , loads + // and cmp. + // 8. A scalar loop increment block to increment the PHIs and go back + // around the loop. + + BasicBlock *MinItCheckBlock = BasicBlock::Create( + Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock); + + // Update the terminator added by SplitBlock to branch to the first block + Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock); + + BasicBlock *MemCheckBlock = BasicBlock::Create( + Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopPreheaderBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop_preheader", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopStartBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopIncBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop_inc", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopMismatchBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop_found", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopPreHeaderBlock = BasicBlock::Create( + Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopStartBlock = + BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopIncBlock = BasicBlock::Create( + Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock); + + DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock}, + {DominatorTree::Delete, Preheader, EndBlock}}); + + // Update LoopInfo with the new SVE & scalar loops. + auto SVELoop = LI->AllocateLoop(); + auto ScalarLoop = LI->AllocateLoop(); + if (CurLoop->getParentLoop()) { + CurLoop->getParentLoop()->addChildLoop(SVELoop); + CurLoop->getParentLoop()->addChildLoop(ScalarLoop); + } else { + LI->addTopLevelLoop(SVELoop); + LI->addTopLevelLoop(ScalarLoop); + } + + // Add the new basic blocks to their associated loops. + SVELoop->addBasicBlockToLoop(MinItCheckBlock, *LI); + SVELoop->addBasicBlockToLoop(MemCheckBlock, *LI); + SVELoop->addBasicBlockToLoop(SVELoopPreheaderBlock, *LI); + SVELoop->addBasicBlockToLoop(SVELoopStartBlock, *LI); + SVELoop->addBasicBlockToLoop(SVELoopIncBlock, *LI); + SVELoop->addBasicBlockToLoop(SVELoopMismatchBlock, *LI); + + ScalarLoop->addBasicBlockToLoop(LoopPreHeaderBlock, *LI); + ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI); + ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI); + + // Set up some types and constants that we intend to reuse. + Type *I64Type = Builder.getInt64Ty(); + + // Check the zero-extended iteration count > 0 + Builder.SetInsertPoint(MinItCheckBlock); + Value *ExtStart = Builder.CreateZExt(Start, I64Type); + Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type); + // This check doesn't really cost us very much. + + Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen); + BranchInst *MinItCheckBr = + BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck); + MinItCheckBr->setMetadata( + LLVMContext::MD_prof, + MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1)); + Builder.Insert(MinItCheckBr); + + DTU.applyUpdates( + {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock}, + {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}}); + + // For each of the arrays, check the start/end addresses are on the same + // page. + Builder.SetInsertPoint(MemCheckBlock); + + // For each start address calculate the offset into the min architecturally + // allowed page size. Then determine how many bytes there are left on the + // page and see if this is >= MaxLen. + Value *LhsStartGEP = Builder.CreateGEP(LoadType, PtrA, ExtStart); + Value *RhsStartGEP = Builder.CreateGEP(LoadType, PtrB, ExtStart); + Value *RhsStart = Builder.CreatePtrToInt(RhsStartGEP, I64Type); + Value *LhsStart = Builder.CreatePtrToInt(LhsStartGEP, I64Type); + Value *LhsEndGEP = Builder.CreateGEP(LoadType, PtrA, ExtEnd); + Value *RhsEndGEP = Builder.CreateGEP(LoadType, PtrB, ExtEnd); + Value *LhsEnd = Builder.CreatePtrToInt(LhsEndGEP, I64Type); + Value *RhsEnd = Builder.CreatePtrToInt(RhsEndGEP, I64Type); + + const uint64_t MinPageSize = TTI->getMinPageSize().value(); + const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize); + Value *LhsStartPage = Builder.CreateLShr(LhsStart, AddrShiftAmt); + Value *LhsEndPage = Builder.CreateLShr(LhsEnd, AddrShiftAmt); + Value *RhsStartPage = Builder.CreateLShr(RhsStart, AddrShiftAmt); + Value *RhsEndPage = Builder.CreateLShr(RhsEnd, AddrShiftAmt); + Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage); + Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage); + + Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp); + BranchInst *CombinedPageCmpCmpBr = BranchInst::Create( + LoopPreHeaderBlock, SVELoopPreheaderBlock, CombinedPageCmp); + CombinedPageCmpCmpBr->setMetadata( + LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext()) + .createBranchWeights(10, 90)); + Builder.Insert(CombinedPageCmpCmpBr); + + DTU.applyUpdates( + {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock}, + {DominatorTree::Insert, MemCheckBlock, SVELoopPreheaderBlock}}); + + // Set up the SVE loop preheader, i.e. calculate initial loop predicate, + // zero-extend MaxLen to 64-bits, determine the number of vector elements + // processed in each iteration, etc. + Builder.SetInsertPoint(SVELoopPreheaderBlock); + + // At this point we know two things must be true: + // 1. Start <= End + // 2. ExtMaxLen <= 4096 due to the page checks. + // Therefore, we know that we can use a 64-bit induction variable that + // starts from 0 -> ExtMaxLen and it will not overflow. + ScalableVectorType *PredVTy = + ScalableVectorType::get(Builder.getInt1Ty(), 16); + + Value *InitialPred = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd}); + + Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {}); + VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "", + /*HasNUW=*/true, /*HasNSW=*/true); + + Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(), + Builder.getInt1(false)); + + BranchInst *JumpToSVELoop = BranchInst::Create(SVELoopStartBlock); + Builder.Insert(JumpToSVELoop); + + DTU.applyUpdates( + {{DominatorTree::Insert, SVELoopPreheaderBlock, SVELoopStartBlock}}); + + // Set up the first SVE loop block by creating the PHIs, doing the vector + // loads and comparing the vectors. + Builder.SetInsertPoint(SVELoopStartBlock); + PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_sve_loop_pred"); + LoopPred->addIncoming(InitialPred, SVELoopPreheaderBlock); + PHINode *SVEIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_sve_index"); + SVEIndexPhi->addIncoming(ExtStart, SVELoopPreheaderBlock); + Type *SVELoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16); + Value *GepOffset = SVEIndexPhi; + Value *Passthru = ConstantInt::getNullValue(SVELoadType); + + Value *SVELhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset); + if (GEPA->isInBounds()) + cast(SVELhsGep)->setIsInBounds(true); + Value *SVELhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVELhsGep, Align(1), + LoopPred, Passthru); + + Value *SVERhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset); + if (GEPB->isInBounds()) + cast(SVERhsGep)->setIsInBounds(true); + Value *SVERhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVERhsGep, Align(1), + LoopPred, Passthru); + + Value *SVEMatchCmp = Builder.CreateICmpNE(SVELhsLoad, SVERhsLoad); + SVEMatchCmp = Builder.CreateSelect(LoopPred, SVEMatchCmp, PFalse); + Value *SVEMatchHasActiveLanes = Builder.CreateOrReduce(SVEMatchCmp); + BranchInst *SVEEarlyExit = BranchInst::Create( + SVELoopMismatchBlock, SVELoopIncBlock, SVEMatchHasActiveLanes); + Builder.Insert(SVEEarlyExit); + + DTU.applyUpdates( + {{DominatorTree::Insert, SVELoopStartBlock, SVELoopMismatchBlock}, + {DominatorTree::Insert, SVELoopStartBlock, SVELoopIncBlock}}); + + // Increment the index counter and calculate the predicate for the next + // iteration of the loop. We branch back to the start of the loop if there + // is at least one active lane. + Builder.SetInsertPoint(SVELoopIncBlock); + Value *NewSVEIndexPhi = Builder.CreateAdd(SVEIndexPhi, VecLen, "", + /*HasNUW=*/true, /*HasNSW=*/true); + SVEIndexPhi->addIncoming(NewSVEIndexPhi, SVELoopIncBlock); + Value *NewPred = + Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, + {PredVTy, I64Type}, {NewSVEIndexPhi, ExtEnd}); + LoopPred->addIncoming(NewPred, SVELoopIncBlock); + + Value *PredHasActiveLanes = + Builder.CreateExtractElement(NewPred, uint64_t(0)); + BranchInst *SVELoopBranchBack = + BranchInst::Create(SVELoopStartBlock, EndBlock, PredHasActiveLanes); + Builder.Insert(SVELoopBranchBack); + + DTU.applyUpdates({{DominatorTree::Insert, SVELoopIncBlock, SVELoopStartBlock}, + {DominatorTree::Insert, SVELoopIncBlock, EndBlock}}); + + // If we found a mismatch then we need to calculate which lane in the vector + // had a mismatch and add that on to the current loop index. + Builder.SetInsertPoint(SVELoopMismatchBlock); + Value *PredMatchCmp = Builder.CreateAnd(LoopPred, SVEMatchCmp); + Value *Ctz = Builder.CreateIntrinsic( + Intrinsic::experimental_cttz_elts, {ResType, SVEMatchCmp->getType()}, + {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)}); + Ctz = Builder.CreateZExt(Ctz, I64Type); + Value *SVELoopRes64 = Builder.CreateAdd(SVEIndexPhi, Ctz, "", + /*HasNUW=*/true, /*HasNSW=*/true); + Value *SVELoopRes = Builder.CreateTrunc(SVELoopRes64, ResType); + + Builder.Insert(BranchInst::Create(EndBlock)); + + DTU.applyUpdates({{DominatorTree::Insert, SVELoopMismatchBlock, EndBlock}}); + + // Generate code for scalar loop. + Builder.SetInsertPoint(LoopPreHeaderBlock); + Builder.Insert(BranchInst::Create(LoopStartBlock)); + + DTU.applyUpdates( + {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}}); + + Builder.SetInsertPoint(LoopStartBlock); + PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index"); + IndexPhi->addIncoming(Start, LoopPreHeaderBlock); + + // Otherwise compare the values + // Load bytes from each array and compare them. + GepOffset = Builder.CreateZExt(IndexPhi, I64Type); + + Value *LhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset); + if (GEPA->isInBounds()) + cast(LhsGep)->setIsInBounds(true); + Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep); + + Value *RhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset); + if (GEPB->isInBounds()) + cast(RhsGep)->setIsInBounds(true); + Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep); + + Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad); + // If we have a mismatch then exit the loop ... + BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp); + Builder.Insert(MatchCmpBr); + + DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock}, + {DominatorTree::Insert, LoopStartBlock, EndBlock}}); + + // Have we reached the maximum permitted length for the loop? + Builder.SetInsertPoint(LoopIncBlock); + Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1)); + IndexPhi->addIncoming(PhiInc, LoopIncBlock); + Value *IVCmp = Builder.CreateICmpEQ(IndexPhi, MaxLen); + BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp); + Builder.Insert(IVCmpBr); + + DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock}, + {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}}); + + // In the end block we need to insert a PHI node to deal with three cases: + // 1. We didn't find a mismatch in the scalar loop, so we return MaxLen. + // 2. We exitted the scalar loop early due to a mismatch and need to return + // the index that we found. + // 3. We didn't find a mismatch in the SVE loop, so we return MaxLen. + // 4. We exitted the SVE loop early due to a mismatch and need to return + // the index that we found. + Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt()); + PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result"); + ResPhi->addIncoming(MaxLen, LoopIncBlock); + ResPhi->addIncoming(IndexPhi, LoopStartBlock); + ResPhi->addIncoming(MaxLen, SVELoopIncBlock); + ResPhi->addIncoming(SVELoopRes, SVELoopMismatchBlock); + + return Builder.CreateTrunc(ResPhi, ResType); +} + +void AArch64LoopIdiomTransform::transformByteCompare( + GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, Value *MaxLen, + Value *Index, Value *Start, bool IncIdx, BasicBlock *FoundBB, + BasicBlock *EndBB) { + + // Insert the byte compare intrinsic at the end of the preheader block + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + BasicBlock *Header = CurLoop->getHeader(); + BranchInst *PHBranch = cast(Preheader->getTerminator()); + IRBuilder<> Builder(PHBranch); + Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); + + // Increment the pointer if this was done before the loads in the loop. + if (IncIdx) + Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1)); + + Value *ByteCmpRes = expandFindMismatch(Builder, GEPA, GEPB, Start, MaxLen); + + // Replaces uses of index & induction Phi with intrinsic (we already + // checked that the the first instruction of Header is the Phi above). + auto IndPhi = &Header->front(); + IndPhi->replaceAllUsesWith(ByteCmpRes); + Index->replaceAllUsesWith(ByteCmpRes); + + assert(PHBranch->isUnconditional() && + "Expected preheader to terminate with an unconditional branch."); + + // If no mismatch was found, we can jump to the end block. Create a + // new basic block for the compare instruction. + auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare", + Preheader->getParent()); + CmpBB->moveBefore(EndBB); + + // Replace the branch in the preheader with an always-true conditional branch. + // This ensures there is still a reference to the original loop. + Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header); + PHBranch->eraseFromParent(); + + // Create the branch to either the end or found block depending on the value + // returned by the intrinsic. + Builder.SetInsertPoint(CmpBB); + Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen); + Builder.CreateCondBr(FoundCmp, EndBB, FoundBB); + + auto fixSuccessorPhis = [&](BasicBlock *SuccBB) { + for (PHINode &PN : SuccBB->phis()) { + // At this point we've already replaced all uses of the result from the + // loop with ByteCmp. Look through the incoming values to find ByteCmp, + // meaning this is a Phi collecting the results of the byte compare. + bool ResPhi = false; + for (Value *Op : PN.incoming_values()) + if (Op == CmpBB) + ResPhi = true; + + // If any of the incoming values were ByteCmp, we need to also add + // it as an incoming value from CmpBB. + if (ResPhi) + PN.addIncoming(ByteCmpRes, CmpBB); + else { + // Otherwise, this is a Phi for different values. We should create + // a new incoming value from CmpBB matching the same value as from + // the old loop. + for (BasicBlock *BB : PN.blocks()) + if (CurLoop->contains(BB)) { + PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB); + break; + } + } + } + }; + + // Ensure all Phis in the successors of CmpBB have an incoming value from it. + fixSuccessorPhis(EndBB); + fixSuccessorPhis(FoundBB); + + // The new CmpBB block isn't part of the loop, but will need to be added to + // the outer loop if there is one. + if (!CurLoop->isOutermost()) + CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, *LI); + + // Update the dominator tree with the new block. + DT->addNewBlock(CmpBB, Preheader); +} Index: llvm/lib/Target/AArch64/AArch64TargetMachine.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H #include "AArch64InstrInfo.h" +#include "AArch64LoopIdiomTransform.h" #include "AArch64Subtarget.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" @@ -43,6 +44,8 @@ // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + void registerPassBuilderCallbacks(PassBuilder &PB) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; TargetLoweringObjectFile* getObjFileLowering() const override { Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -11,6 +11,7 @@ #include "AArch64TargetMachine.h" #include "AArch64.h" +#include "AArch64LoopIdiomTransform.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64MachineScheduler.h" #include "AArch64MacroFusion.h" @@ -43,6 +44,7 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Pass.h" +#include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -222,6 +224,7 @@ initializeAArch64DeadRegisterDefinitionsPass(*PR); initializeAArch64ExpandPseudoPass(*PR); initializeAArch64LoadStoreOptPass(*PR); + initializeAArch64LoopIdiomTransformLegacyPassPass(*PR); initializeAArch64MIPeepholeOptPass(*PR); initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64O0PreLegalizerCombinerPass(*PR); @@ -535,6 +538,13 @@ } // end anonymous namespace +void AArch64TargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { + PB.registerLateLoopOptimizationsEPCallback( + [=](LoopPassManager &LPM, OptimizationLevel Level) { + LPM.addPass(AArch64LoopIdiomTransformPass()); + }); +} + TargetTransformInfo AArch64TargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(AArch64TTIImpl(this, F)); Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -419,6 +419,8 @@ return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy); } + + std::optional getMinPageSize() const { return 4096; } }; } // end namespace llvm Index: llvm/lib/Target/AArch64/CMakeLists.txt =================================================================== --- llvm/lib/Target/AArch64/CMakeLists.txt +++ llvm/lib/Target/AArch64/CMakeLists.txt @@ -64,6 +64,7 @@ AArch64ISelLowering.cpp AArch64InstrInfo.cpp AArch64LoadStoreOptimizer.cpp + AArch64LoopIdiomTransform.cpp AArch64LowerHomogeneousPrologEpilog.cpp AArch64MachineFunctionInfo.cpp AArch64MachineScheduler.cpp Index: llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll @@ -0,0 +1,1640 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -aarch64-lit -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s +; RUN: opt -aarch64-lit -simplifycfg -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL +; RUN: opt -aarch64-lit -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM + +define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_simple( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; CHECK: mismatch_min_it_check: +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: mismatch_mem_check: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12 +; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12 +; CHECK-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: mismatch_sve_loop_preheader: +; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 +; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] +; CHECK: mismatch_sve_loop: +; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] +; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) +; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] +; CHECK: mismatch_sve_loop_inc: +; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; CHECK-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) +; CHECK-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 +; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_sve_loop_found: +; CHECK-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LOOP_PRED]], [[TMP27]] +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) +; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 +; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 +; CHECK-NEXT: br label [[MISMATCH_END]] +; CHECK: mismatch_loop_pre: +; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]] +; CHECK: mismatch_loop: +; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]] +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]] +; CHECK-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; CHECK: mismatch_loop_inc: +; CHECK-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1 +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; CHECK: mismatch_end: +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC:%.*]] = add i32 [[MISMATCH_RESULT]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: byte.compare: +; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[TMP47]], label [[WHILE_END]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_simple( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]] +; LOOP-DEL: mismatch_mem_check: +; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 +; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; LOOP-DEL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64 +; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64 +; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12 +; LOOP-DEL-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12 +; LOOP-DEL-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12 +; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12 +; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] +; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] +; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]] +; LOOP-DEL: mismatch_sve_loop_preheader: +; LOOP-DEL-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) +; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 +; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] +; LOOP-DEL: mismatch_sve_loop: +; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] +; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] +; LOOP-DEL-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] +; LOOP-DEL-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] +; LOOP-DEL-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) +; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] +; LOOP-DEL: mismatch_sve_loop_inc: +; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; LOOP-DEL-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) +; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 +; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END:%.*]] +; LOOP-DEL: mismatch_sve_loop_found: +; LOOP-DEL-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LOOP_PRED]], [[TMP27]] +; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) +; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 +; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP34]] +; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 +; LOOP-DEL-NEXT: br label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_pre: +; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LOOP-DEL: mismatch_loop: +; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LOOP-DEL-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]] +; LOOP-DEL-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 +; LOOP-DEL-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]] +; LOOP-DEL-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1 +; LOOP-DEL-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]] +; LOOP-DEL-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_inc: +; LOOP-DEL-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1 +; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[WHILE_END]], label [[MISMATCH_LOOP]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_umin(ptr %a, ptr %b, i32 %len, i32 %n, i32 %idx1, i32 %idx2) { +; CHECK-LABEL: define i32 @compare_bytes_umin( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[IDX1:%.*]], i32 [[IDX2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[PH:%.*]] +; CHECK: ph: +; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.umin.i32(i32 [[IDX1]], i32 [[IDX2]]) +; CHECK-NEXT: [[EXT:%.*]] = zext i32 [[START]] to i64 +; CHECK-NEXT: [[A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EXT]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A0]], align 1 +; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EXT]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[A1]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_COND_PREHEADER:%.*]], label [[WHILE_END:%.*]] +; CHECK: while.cond.preheader: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[START]], 1 +; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; CHECK: mismatch_min_it_check: +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ule i32 [[TMP2]], [[N]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; CHECK: mismatch_mem_check: +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[TMP6]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP11]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP9]], 12 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP12]], 12 +; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP8]], 12 +; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP13]], 12 +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; CHECK: mismatch_sve_loop_preheader: +; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP3]], i64 [[TMP4]]) +; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP23:%.*]] = mul nuw nsw i64 [[TMP22]], 16 +; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] +; CHECK: mismatch_sve_loop: +; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP21]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP32:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP3]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP31:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] +; CHECK-NEXT: [[TMP27:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP26]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne [[TMP25]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP29]]) +; CHECK-NEXT: br i1 [[TMP30]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] +; CHECK: mismatch_sve_loop_inc: +; CHECK-NEXT: [[TMP31]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP23]] +; CHECK-NEXT: [[TMP32]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP31]], i64 [[TMP4]]) +; CHECK-NEXT: [[TMP33:%.*]] = extractelement [[TMP32]], i64 0 +; CHECK-NEXT: br i1 [[TMP33]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_sve_loop_found: +; CHECK-NEXT: [[TMP34:%.*]] = and [[MISMATCH_SVE_LOOP_PRED]], [[TMP29]] +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP34]], i1 true) +; CHECK-NEXT: [[TMP36:%.*]] = zext i32 [[TMP35]] to i64 +; CHECK-NEXT: [[TMP37:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = trunc i64 [[TMP37]] to i32 +; CHECK-NEXT: br label [[MISMATCH_END]] +; CHECK: mismatch_loop_pre: +; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]] +; CHECK: mismatch_loop: +; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP2]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP45:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP39]] +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i8 [[TMP41]], [[TMP43]] +; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; CHECK: mismatch_loop_inc: +; CHECK-NEXT: [[TMP45]] = add i32 [[MISMATCH_INDEX]], 1 +; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; CHECK-NEXT: br i1 [[TMP46]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; CHECK: mismatch_end: +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP38]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_PHI:%.*]] = phi i32 [ [[START]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC:%.*]] = add i32 [[MISMATCH_RESULT]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; CHECK-NEXT: [[IDX_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP47:%.*]] = load i8, ptr [[IDX_A]], align 1 +; CHECK-NEXT: [[IDX_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP48:%.*]] = load i8, ptr [[IDX_B]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP47]], [[TMP48]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: byte.compare: +; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[TMP49]], label [[WHILE_END]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[N]], [[PH]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_umin( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[IDX1:%.*]], i32 [[IDX2:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: [[START:%.*]] = call i32 @llvm.umin.i32(i32 [[IDX1]], i32 [[IDX2]]) +; LOOP-DEL-NEXT: [[EXT:%.*]] = zext i32 [[START]] to i64 +; LOOP-DEL-NEXT: [[A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EXT]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[A0]], align 1 +; LOOP-DEL-NEXT: [[A1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EXT]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[A1]], align 1 +; LOOP-DEL-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP]], label [[WHILE_COND_PREHEADER:%.*]], label [[WHILE_END:%.*]] +; LOOP-DEL: while.cond.preheader: +; LOOP-DEL-NEXT: [[TMP2:%.*]] = add i32 [[START]], 1 +; LOOP-DEL-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; LOOP-DEL-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64 +; LOOP-DEL-NEXT: [[TMP5:%.*]] = icmp ule i32 [[TMP2]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP5]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; LOOP-DEL: mismatch_mem_check: +; LOOP-DEL-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] +; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; LOOP-DEL-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[TMP6]] to i64 +; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]] +; LOOP-DEL-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; LOOP-DEL-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; LOOP-DEL-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP11]] to i64 +; LOOP-DEL-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP9]], 12 +; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP12]], 12 +; LOOP-DEL-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP8]], 12 +; LOOP-DEL-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP13]], 12 +; LOOP-DEL-NEXT: [[TMP18:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] +; LOOP-DEL-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP16]], [[TMP17]] +; LOOP-DEL-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]] +; LOOP-DEL-NEXT: br i1 [[TMP20]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; LOOP-DEL: mismatch_sve_loop_preheader: +; LOOP-DEL-NEXT: [[TMP21:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP3]], i64 [[TMP4]]) +; LOOP-DEL-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; LOOP-DEL-NEXT: [[TMP23:%.*]] = mul nuw nsw i64 [[TMP22]], 16 +; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] +; LOOP-DEL: mismatch_sve_loop: +; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP21]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP32:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP3]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP31:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] +; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] +; LOOP-DEL-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] +; LOOP-DEL-NEXT: [[TMP27:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP26]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: [[TMP28:%.*]] = icmp ne [[TMP25]], [[TMP27]] +; LOOP-DEL-NEXT: [[TMP29:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP28]], zeroinitializer +; LOOP-DEL-NEXT: [[TMP30:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP29]]) +; LOOP-DEL-NEXT: br i1 [[TMP30]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] +; LOOP-DEL: mismatch_sve_loop_inc: +; LOOP-DEL-NEXT: [[TMP31]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP23]] +; LOOP-DEL-NEXT: [[TMP32]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP31]], i64 [[TMP4]]) +; LOOP-DEL-NEXT: [[TMP33:%.*]] = extractelement [[TMP32]], i64 0 +; LOOP-DEL-NEXT: br i1 [[TMP33]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END]] +; LOOP-DEL: mismatch_sve_loop_found: +; LOOP-DEL-NEXT: [[TMP34:%.*]] = and [[MISMATCH_SVE_LOOP_PRED]], [[TMP29]] +; LOOP-DEL-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP34]], i1 true) +; LOOP-DEL-NEXT: [[TMP36:%.*]] = zext i32 [[TMP35]] to i64 +; LOOP-DEL-NEXT: [[TMP37:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP36]] +; LOOP-DEL-NEXT: [[TMP38:%.*]] = trunc i64 [[TMP37]] to i32 +; LOOP-DEL-NEXT: br label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_pre: +; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LOOP-DEL: mismatch_loop: +; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP2]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP45:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[TMP39:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LOOP-DEL-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP39]] +; LOOP-DEL-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1 +; LOOP-DEL-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP39]] +; LOOP-DEL-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 +; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i8 [[TMP41]], [[TMP43]] +; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_inc: +; LOOP-DEL-NEXT: [[TMP45]] = add i32 [[MISMATCH_INDEX]], 1 +; LOOP-DEL-NEXT: [[TMP46:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP46]], label [[WHILE_END]], label [[MISMATCH_LOOP]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[RES:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP38]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: ret i32 [[RES]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_umin( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[IDX1:%.*]], i32 [[IDX2:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[PH:%.*]] +; NO-TRANSFORM: ph: +; NO-TRANSFORM-NEXT: [[START:%.*]] = call i32 @llvm.umin.i32(i32 [[IDX1]], i32 [[IDX2]]) +; NO-TRANSFORM-NEXT: [[EXT:%.*]] = zext i32 [[START]] to i64 +; NO-TRANSFORM-NEXT: [[A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EXT]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[A0]], align 1 +; NO-TRANSFORM-NEXT: [[A1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EXT]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[A1]], align 1 +; NO-TRANSFORM-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP]], label [[WHILE_COND_PREHEADER:%.*]], label [[WHILE_END:%.*]] +; NO-TRANSFORM: while.cond.preheader: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_PHI:%.*]] = phi i32 [ [[START]], [[WHILE_COND_PREHEADER]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_PHI]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[IDX_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP2:%.*]] = load i8, ptr [[IDX_A]], align 1 +; NO-TRANSFORM-NEXT: [[IDX_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP3:%.*]] = load i8, ptr [[IDX_B]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP2]], [[TMP3]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[RES:%.*]] = phi i32 [ [[N]], [[PH]] ], [ [[INC]], [[WHILE_COND]] ], [ [[INC]], [[WHILE_BODY]] ] +; NO-TRANSFORM-NEXT: ret i32 [[RES]] +; +entry: + br label %ph + +ph: + %start = call i32 @llvm.umin.i32(i32 %idx1, i32 %idx2) + %ext = zext i32 %start to i64 + %a0 = getelementptr inbounds i8, ptr %a, i64 %ext + %0 = load i8, ptr %a0, align 1 + %a1 = getelementptr inbounds i8, ptr %b, i64 %ext + %1 = load i8, ptr %a1, align 1 + %cmp = icmp eq i8 %0, %1 + br i1 %cmp, label %while.cond.preheader, label %while.end + +while.cond.preheader: + br label %while.cond + +while.cond: + %len.phi = phi i32 [ %start, %while.cond.preheader ], [ %inc, %while.body ] + %inc = add i32 %len.phi, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %idx.a = getelementptr inbounds i8, ptr %a, i64 %idxprom + %2 = load i8, ptr %idx.a, align 1 + %idx.b = getelementptr inbounds i8, ptr %b, i64 %idxprom + %3 = load i8, ptr %idx.b, align 1 + %cmp.not2 = icmp eq i8 %2, %3 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %res = phi i32 [ %n, %ph], [ %inc, %while.cond], [ %inc, %while.body ] + ret i32 %res +} + +declare i32 @llvm.umin.i32(i32, i32); + +define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) { +; CHECK-LABEL: define i32 @compare_bytes_extra_cmp( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]] +; CHECK-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]] +; CHECK: ph: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; CHECK: mismatch_min_it_check: +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; CHECK: mismatch_mem_check: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12 +; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12 +; CHECK-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; CHECK: mismatch_sve_loop_preheader: +; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 +; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] +; CHECK: mismatch_sve_loop: +; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] +; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) +; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] +; CHECK: mismatch_sve_loop_inc: +; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; CHECK-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) +; CHECK-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 +; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_sve_loop_found: +; CHECK-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LOOP_PRED]], [[TMP27]] +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) +; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 +; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 +; CHECK-NEXT: br label [[MISMATCH_END]] +; CHECK: mismatch_loop_pre: +; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]] +; CHECK: mismatch_loop: +; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]] +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]] +; CHECK-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; CHECK: mismatch_loop_inc: +; CHECK-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1 +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; CHECK: mismatch_end: +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC:%.*]] = add i32 [[MISMATCH_RESULT]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: byte.compare: +; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[TMP47]], label [[WHILE_END]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_extra_cmp( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]] +; LOOP-DEL-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]] +; LOOP-DEL: ph: +; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; LOOP-DEL: mismatch_mem_check: +; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 +; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; LOOP-DEL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64 +; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64 +; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12 +; LOOP-DEL-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12 +; LOOP-DEL-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12 +; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12 +; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]] +; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] +; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; LOOP-DEL: mismatch_sve_loop_preheader: +; LOOP-DEL-NEXT: [[TMP19:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]]) +; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16 +; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] +; LOOP-DEL: mismatch_sve_loop: +; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] +; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]] +; LOOP-DEL-NEXT: [[TMP23:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]] +; LOOP-DEL-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne [[TMP23]], [[TMP25]] +; LOOP-DEL-NEXT: [[TMP27:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP26]], zeroinitializer +; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP27]]) +; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] +; LOOP-DEL: mismatch_sve_loop_inc: +; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]] +; LOOP-DEL-NEXT: [[TMP30]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]]) +; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 +; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END]] +; LOOP-DEL: mismatch_sve_loop_found: +; LOOP-DEL-NEXT: [[TMP32:%.*]] = and [[MISMATCH_SVE_LOOP_PRED]], [[TMP27]] +; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP32]], i1 true) +; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64 +; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP34]] +; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 +; LOOP-DEL-NEXT: br label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_pre: +; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LOOP-DEL: mismatch_loop: +; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LOOP-DEL-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]] +; LOOP-DEL-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 +; LOOP-DEL-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]] +; LOOP-DEL-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1 +; LOOP-DEL-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]] +; LOOP-DEL-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_inc: +; LOOP-DEL-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1 +; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[WHILE_END]], label [[MISMATCH_LOOP]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_extra_cmp( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]] +; NO-TRANSFORM: ph: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[PH]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + %cmp.x = icmp ult i32 %n, %x + br i1 %cmp.x, label %ph, label %while.end + +ph: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %ph ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ], [ %x, %entry ] + ret i32 %inc.lcssa +} + +define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) { +; CHECK-LABEL: define void @compare_bytes_cleanup_block( +; CHECK-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; CHECK: mismatch_min_it_check: +; CHECK-NEXT: br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; CHECK: mismatch_mem_check: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP5]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP3]], 12 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP6]], 12 +; CHECK-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP2]], 12 +; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP7]], 12 +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; CHECK: mismatch_sve_loop_preheader: +; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 1, i64 0) +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul nuw nsw i64 [[TMP16]], 16 +; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]] +; CHECK: mismatch_sve_loop: +; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi [ [[TMP15]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP26:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP25:%.*]], [[MISMATCH_SVE_LOOP_INC]] ] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_SVE_INDEX]] +; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_SVE_INDEX]] +; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, [[MISMATCH_SVE_LOOP_PRED]], zeroinitializer) +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne [[TMP19]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = select [[MISMATCH_SVE_LOOP_PRED]], [[TMP22]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP23]]) +; CHECK-NEXT: br i1 [[TMP24]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]] +; CHECK: mismatch_sve_loop_inc: +; CHECK-NEXT: [[TMP25]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP17]] +; CHECK-NEXT: [[TMP26]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP25]], i64 0) +; CHECK-NEXT: [[TMP27:%.*]] = extractelement [[TMP26]], i64 0 +; CHECK-NEXT: br i1 [[TMP27]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_sve_loop_found: +; CHECK-NEXT: [[TMP28:%.*]] = and [[MISMATCH_SVE_LOOP_PRED]], [[TMP23]] +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( [[TMP28]], i1 true) +; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP29]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 +; CHECK-NEXT: br label [[MISMATCH_END]] +; CHECK: mismatch_loop_pre: +; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]] +; CHECK: mismatch_loop: +; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP39:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[TMP33:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP34]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i8 [[TMP35]], [[TMP37]] +; CHECK-NEXT: br i1 [[TMP38]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; CHECK: mismatch_loop_inc: +; CHECK-NEXT: [[TMP39]] = add i32 [[MISMATCH_INDEX]], 1 +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], 0 +; CHECK-NEXT: br i1 [[TMP40]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; CHECK: mismatch_end: +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP32]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ] +; CHECK-NEXT: [[INC:%.*]] = add i32 [[MISMATCH_RESULT]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0 +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]] +; CHECK: byte.compare: +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0 +; CHECK-NEXT: br i1 [[TMP43]], label [[CLEANUP_THREAD]], label [[IF_END]] +; CHECK: cleanup.thread: +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: ret void +; +; LOOP-DEL-LABEL: define void @compare_bytes_cleanup_block( +; LOOP-DEL-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LOOP-DEL: mismatch_loop: +; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[MISMATCH_LOOP]] ] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LOOP-DEL-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP0]] +; LOOP-DEL-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1 +; LOOP-DEL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP0]] +; LOOP-DEL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 +; LOOP-DEL-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP2]], [[TMP4]] +; LOOP-DEL-NEXT: [[TMP6]] = add i32 [[MISMATCH_INDEX]], 1 +; LOOP-DEL-NEXT: [[TMP7:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], 0 +; LOOP-DEL-NEXT: [[OR_COND:%.*]] = or i1 [[TMP5]], [[TMP7]] +; LOOP-DEL-NEXT: br i1 [[OR_COND]], label [[COMMON_RET:%.*]], label [[MISMATCH_LOOP]] +; LOOP-DEL: common.ret: +; LOOP-DEL-NEXT: ret void +; +; NO-TRANSFORM-LABEL: define void @compare_bytes_cleanup_block( +; NO-TRANSFORM-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], 0 +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]] +; NO-TRANSFORM: cleanup.thread: +; NO-TRANSFORM-NEXT: ret void +; NO-TRANSFORM: if.end: +; NO-TRANSFORM-NEXT: [[RES:%.*]] = phi i32 [ [[LEN]], [[WHILE_BODY]] ] +; NO-TRANSFORM-NEXT: ret void +; +entry: + br label %while.cond + +while.cond: + %len = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %inc = add i32 %len, 1 + %cmp.not = icmp eq i32 %inc, 0 + br i1 %cmp.not, label %cleanup.thread, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr i8, ptr %src1, i64 %idxprom + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr i8, ptr %src2, i64 %idxprom + %1 = load i8, ptr %arrayidx2, align 1 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %if.end + +cleanup.thread: + ret void + +if.end: + %res = phi i32 [ %len, %while.body ] + ret void +} + +; +; NEGATIVE TESTS +; + +define i32 @compare_bytes_sign_ext(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_sign_ext( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_sign_ext( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_sign_ext( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = sext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_signed_wrap( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add nsw i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_signed_wrap( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add nsw i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_signed_wrap( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add nsw i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add nsw i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_outside_uses(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_outside_uses( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[RES:%.*]] = phi i1 [ [[CMP_NOT]], [[WHILE_BODY]] ], [ [[CMP_NOT]], [[WHILE_COND]] ] +; CHECK-NEXT: [[EXT_RES:%.*]] = zext i1 [[RES]] to i32 +; CHECK-NEXT: ret i32 [[EXT_RES]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_outside_uses( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[IV]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[LEN]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[RES:%.*]] = phi i1 [ [[CMP_NOT]], [[WHILE_BODY]] ], [ [[CMP_NOT]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: [[EXT_RES:%.*]] = zext i1 [[RES]] to i32 +; LOOP-DEL-NEXT: ret i32 [[EXT_RES]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_outside_uses( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[IV]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[LEN]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[RES:%.*]] = phi i1 [ [[CMP_NOT]], [[WHILE_BODY]] ], [ [[CMP_NOT]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: [[EXT_RES:%.*]] = zext i1 [[RES]] to i32 +; NO-TRANSFORM-NEXT: ret i32 [[EXT_RES]] +; +entry: + br label %while.cond + +while.cond: + %iv = phi i32 [ 0, %entry ], [ %inc, %while.body ] + %inc = add i32 %iv, 1 + %cmp.not = icmp eq i32 %inc, %len + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %res = phi i1 [ %cmp.not, %while.body ], [ %cmp.not, %while.cond ] + %ext_res = zext i1 %res to i32 + ret i32 %ext_res +} + +define i64 @compare_bytes_i64_index(ptr %a, ptr %b, i64 %len, i64 %n) { +; CHECK-LABEL: define i64 @compare_bytes_i64_index( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[LEN:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i64 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i64 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INC]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INC]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i64 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i64 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i64 @compare_bytes_i64_index( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[LEN:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i64 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i64 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INC]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INC]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i64 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i64 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: define i64 @compare_bytes_i64_index( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[LEN:%.*]], i64 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i64 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i64 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INC]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INC]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i64 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i64 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i64 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i64 %len.addr, 1 + %cmp.not = icmp eq i64 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %inc + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %inc + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i64 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i64 %inc.lcssa +} + +define i32 @compare_bytes_simple_wrong_icmp1(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_simple_wrong_icmp1( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp ne i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_simple_wrong_icmp1( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp ne i32 [[INC]], [[N]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_wrong_icmp1( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp ne i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp ne i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_simple_wrong_icmp2(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_simple_wrong_icmp2( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_BODY]], label [[WHILE_END:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_simple_wrong_icmp2( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_BODY]], label [[WHILE_END:%.*]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_wrong_icmp2( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_BODY]], label [[WHILE_END:%.*]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.body, label %while.end + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_simple_wrong_icmp3(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_simple_wrong_icmp3( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp ne i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_simple_wrong_icmp3( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp ne i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_wrong_icmp3( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp ne i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp ne i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_simple_wrong_icmp4(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_simple_wrong_icmp4( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_END]], label [[WHILE_COND]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_simple_wrong_icmp4( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_END]], label [[WHILE_COND]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_wrong_icmp4( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_END]], label [[WHILE_COND]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.end, label %while.cond + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_bad_load_type(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_bad_load_type( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i16 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_bad_load_type( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i16 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_bad_load_type( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i16 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i16, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i16, ptr %arrayidx2 + %cmp.not2 = icmp eq i16 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} Index: llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn =================================================================== --- llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn +++ llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn @@ -129,6 +129,7 @@ "AArch64ISelLowering.cpp", "AArch64InstrInfo.cpp", "AArch64LoadStoreOptimizer.cpp", + "AArch64LoopIdiomTransform.cpp", "AArch64LowerHomogeneousPrologEpilog.cpp", "AArch64MCInstLower.cpp", "AArch64MIPeepholeOpt.cpp",