diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -592,17 +592,35 @@ /// Don't restrict interleaved unrolling to small loops. bool enableAggressiveInterleaving(bool LoopHasReductions) const; - /// If not nullptr, enable inline expansion of memcmp. IsZeroCmp is - /// true if this is the expansion of memcmp(p1, p2, s) == 0. + /// Returns options for expansion of memcmp. IsZeroCmp is + // true if this is the expansion of memcmp(p1, p2, s) == 0. struct MemCmpExpansionOptions { + // Return true if memcmp expansion is enabled. + operator bool() const { return MaxNumLoads > 0; } + + // Maximum number of load operations. + unsigned MaxNumLoads = 0; + // The list of available load sizes (in bytes), sorted in decreasing order. SmallVector LoadSizes; + + // For memcmp expansion when the memcmp result is only compared equal or + // not-equal to 0, allow up to this number of load pairs per block. As an + // example, this may allow 'memcmp(a, b, 3) == 0' in a single block: + // a0 = load2bytes &a[0] + // b0 = load2bytes &b[0] + // a2 = load1byte &a[2] + // b2 = load1byte &b[2] + // r = cmp eq (a0 ^ b0 | a2 ^ b2), 0 + unsigned NumLoadsPerBlock = 1; + // Set to true to allow overlapping loads. For example, 7-byte compares can // be done with two 4-byte compares instead of 4+2+1-byte compares. This // requires all loads in LoadSizes to be doable in an unaligned way. bool AllowOverlappingLoads = false; }; - const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsZeroCmp) const; + MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, + bool IsZeroCmp) const; /// Enable matching of interleaved access groups. 
bool enableInterleavedAccessVectorization() const; @@ -1113,8 +1131,8 @@ unsigned VF) = 0; virtual bool supportsEfficientVectorElementLoadStore() = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; - virtual const MemCmpExpansionOptions *enableMemCmpExpansion( - bool IsZeroCmp) const = 0; + virtual MemCmpExpansionOptions + enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0; virtual bool enableInterleavedAccessVectorization() = 0; virtual bool enableMaskedInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; @@ -1402,9 +1420,9 @@ bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } - const MemCmpExpansionOptions *enableMemCmpExpansion( - bool IsZeroCmp) const override { - return Impl.enableMemCmpExpansion(IsZeroCmp); + MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, + bool IsZeroCmp) const override { + return Impl.enableMemCmpExpansion(OptSize, IsZeroCmp); } bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -275,9 +275,9 @@ bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } - const TTI::MemCmpExpansionOptions *enableMemCmpExpansion( - bool IsZeroCmp) const { - return nullptr; + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, + bool IsZeroCmp) const { + return {}; } bool enableInterleavedAccessVectorization() { return false; } diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -434,9 +434,6 @@ /// shuffles. 
FunctionPass *createExpandReductionsPass(); - // This pass expands memcmp() to load/stores. - FunctionPass *createExpandMemCmpPass(); - /// Creates Break False Dependencies pass. \see BreakFalseDeps.cpp FunctionPass *createBreakFalseDeps(); diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1340,18 +1340,6 @@ return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp; } - /// For memcmp expansion when the memcmp result is only compared equal or - /// not-equal to 0, allow up to this number of load pairs per block. As an - /// example, this may allow 'memcmp(a, b, 3) == 0' in a single block: - /// a0 = load2bytes &a[0] - /// b0 = load2bytes &b[0] - /// a2 = load1byte &a[2] - /// b2 = load1byte &b[2] - /// r = cmp eq (a0 ^ b0 | a2 ^ b2), 0 - virtual unsigned getMemcmpEqZeroLoadsPerBlock() const { - return 1; - } - /// Get maximum # of store operations permitted for llvm.memmove /// /// This function returns the maximum number of store operations permitted diff --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h --- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -191,6 +191,7 @@ void addPGOInstrPasses(legacy::PassManagerBase &MPM, bool IsCS); void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM); void addInstructionCombiningPass(legacy::PassManagerBase &MPM) const; + void addMemcmpPasses(legacy::PassManagerBase &MPM) const; public: /// populateFunctionPassManager - This fills in the function pass manager, diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -369,6 +369,12 @@ // Pass *createMergeICmpsPass(); 
+//===----------------------------------------------------------------------===// +// +// ExpandMemCmp - This pass expands memcmp() to load/stores. +// +Pass *createExpandMemCmpPass(); + //===----------------------------------------------------------------------===// // // ValuePropagation - Propagate CFG-derived value information diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -273,9 +273,9 @@ return TTIImpl->enableAggressiveInterleaving(LoopHasReductions); } -const TargetTransformInfo::MemCmpExpansionOptions * -TargetTransformInfo::enableMemCmpExpansion(bool IsZeroCmp) const { - return TTIImpl->enableMemCmpExpansion(IsZeroCmp); +TargetTransformInfo::MemCmpExpansionOptions +TargetTransformInfo::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + return TTIImpl->enableMemCmpExpansion(OptSize, IsZeroCmp); } bool TargetTransformInfo::enableInterleavedAccessVectorization() const { diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -22,7 +22,6 @@ EdgeBundles.cpp ExecutionDomainFix.cpp ExpandISelPseudos.cpp - ExpandMemCmp.cpp ExpandPostRAPseudos.cpp ExpandReductions.cpp FaultMaps.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -31,7 +31,6 @@ initializeEarlyMachineLICMPass(Registry); initializeEarlyTailDuplicatePass(Registry); initializeExpandISelPseudosPass(Registry); - initializeExpandMemCmpPassPass(Registry); initializeExpandPostRAPass(Registry); initializeFEntryInserterPass(Registry); initializeFinalizeMachineBundlesPass(Registry); diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ 
b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -98,9 +98,6 @@ "enable-implicit-null-checks", cl::desc("Fold null checks into faulting memory operations"), cl::init(false), cl::Hidden); -static cl::opt DisableMergeICmps("disable-mergeicmps", - cl::desc("Disable MergeICmps Pass"), - cl::init(false), cl::Hidden); static cl::opt PrintLSR("print-lsr-output", cl::Hidden, cl::desc("Print LLVM IR produced by the loop-reduce pass")); static cl::opt PrintISelInput("print-isel-input", cl::Hidden, @@ -639,16 +636,6 @@ addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n")); } - if (getOptLevel() != CodeGenOpt::None) { - // The MergeICmpsPass tries to create memcmp calls by grouping sequences of - // loads and compares. ExpandMemCmpPass then tries to expand those calls - // into optimally-sized loads and compares. The transforms are enabled by a - // target lowering hook. - if (!DisableMergeICmps) - addPass(createMergeICmpsPass()); - addPass(createExpandMemCmpPass()); - } - // Run GC lowering passes for builtin collectors // TODO: add a pass insertion point here addPass(createGCLoweringPass()); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -62,8 +62,8 @@ /// @{ bool useColdCCForColdCall(Function &F); bool enableAggressiveInterleaving(bool LoopHasReductions); - const TTI::MemCmpExpansionOptions *enableMemCmpExpansion( - bool IsZeroCmp) const; + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, + bool IsZeroCmp) const; bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ 
b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -238,17 +238,12 @@ return LoopHasReductions; } -const PPCTTIImpl::TTI::MemCmpExpansionOptions * -PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const { - static const auto Options = []() { - TTI::MemCmpExpansionOptions Options; - Options.LoadSizes.push_back(8); - Options.LoadSizes.push_back(4); - Options.LoadSizes.push_back(2); - Options.LoadSizes.push_back(1); - return Options; - }(); - return &Options; +PPCTTIImpl::TTI::MemCmpExpansionOptions +PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + Options.LoadSizes = {8, 4, 2, 1}; + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + return Options; } bool PPCTTIImpl::enableInterleavedAccessVectorization() { diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -843,11 +843,6 @@ /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST. MVT hasFastEqualityCompare(unsigned NumBits) const override; - /// Allow multiple load pairs per block for smaller and faster code. - unsigned getMemcmpEqZeroLoadsPerBlock() const override { - return 2; - } - /// Return the value type to use for ISD::SETCC. 
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -195,8 +195,8 @@ bool areFunctionArgsABICompatible(const Function *Caller, const Function *Callee, SmallPtrSetImpl &Args) const; - const TTI::MemCmpExpansionOptions *enableMemCmpExpansion( - bool IsZeroCmp) const; + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, + bool IsZeroCmp) const; bool enableInterleavedAccessVectorization(); private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3127,38 +3127,29 @@ TM.getSubtarget(*Callee).useAVX512Regs(); } -const X86TTIImpl::TTI::MemCmpExpansionOptions * -X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const { - // Only enable vector loads for equality comparison. - // Right now the vector version is not as fast, see #33329. - static const auto ThreeWayOptions = [this]() { - TTI::MemCmpExpansionOptions Options; - if (ST->is64Bit()) { - Options.LoadSizes.push_back(8); - } - Options.LoadSizes.push_back(4); - Options.LoadSizes.push_back(2); - Options.LoadSizes.push_back(1); - return Options; - }(); - static const auto EqZeroOptions = [this]() { - TTI::MemCmpExpansionOptions Options; +X86TTIImpl::TTI::MemCmpExpansionOptions +X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + Options.NumLoadsPerBlock = 2; + if (IsZeroCmp) { + // Only enable vector loads for equality comparison. 
Right now the vector + // version is not as fast for three way compare (see #33329). // TODO: enable AVX512 when the DAG is ready. // if (ST->hasAVX512()) Options.LoadSizes.push_back(64); if (ST->hasAVX2()) Options.LoadSizes.push_back(32); if (ST->hasSSE2()) Options.LoadSizes.push_back(16); - if (ST->is64Bit()) { - Options.LoadSizes.push_back(8); - } - Options.LoadSizes.push_back(4); - Options.LoadSizes.push_back(2); - Options.LoadSizes.push_back(1); // All GPR and vector loads can be unaligned. SIMD compare requires integer // vectors (SSE2/AVX2). Options.AllowOverlappingLoads = true; - return Options; - }(); - return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions; + } + if (ST->is64Bit()) { + Options.LoadSizes.push_back(8); + } + Options.LoadSizes.push_back(4); + Options.LoadSizes.push_back(2); + Options.LoadSizes.push_back(1); + return Options; } bool X86TTIImpl::enableInterleavedAccessVectorization() { diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -253,6 +253,17 @@ PM.add(createInstructionCombiningPass(ExpensiveCombines)); } +void PassManagerBuilder::addMemcmpPasses(legacy::PassManagerBase &PM) const { + if (OptLevel > 0) { + // The MergeICmpsPass tries to create memcmp calls by grouping sequences of + // loads and compares. ExpandMemCmpPass then tries to expand those calls + // into optimally-sized loads and compares. The transforms are enabled by a + // target transform info hook. + PM.add(createMergeICmpsPass()); + PM.add(createExpandMemCmpPass()); + } +} + void PassManagerBuilder::populateFunctionPassManager( legacy::FunctionPassManager &FPM) { addExtensionsToPM(EP_EarlyAsPossible, FPM); @@ -397,6 +408,7 @@ : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies } MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset + addMemcmpPasses(MPM); // Merge/Expand comparisons. 
MPM.add(createSCCPPass()); // Constant prop with SCCP // Delete dead bit computations (instcombine runs after to fold away the dead @@ -909,6 +921,7 @@ PM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. PM.add(createMemCpyOptPass()); // Remove dead memcpys. + addMemcmpPasses(PM); // Merge/Expand comparisons. // Nuke dead stores. PM.add(createDeadStoreEliminationPass()); diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -10,6 +10,7 @@ DeadStoreElimination.cpp DivRemPairs.cpp EarlyCSE.cpp + ExpandMemCmp.cpp FlattenCFGPass.cpp Float2Int.cpp GuardWidening.cpp diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp rename from llvm/lib/CodeGen/ExpandMemCmp.cpp rename to llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp @@ -13,13 +13,13 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/TargetLowering.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; @@ -38,7 +38,6 @@ namespace { - // This class provides helper functions to expand a memcmp library call into an // inline expansion. class MemCmpExpansion { @@ -67,8 +66,7 @@ // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}. struct LoadEntry { LoadEntry(unsigned LoadSize, uint64_t Offset) - : LoadSize(LoadSize), Offset(Offset) { - } + : LoadSize(LoadSize), Offset(Offset) {} // The size of the load for this block, in bytes. 
unsigned LoadSize; @@ -105,8 +103,7 @@ public: MemCmpExpansion(CallInst *CI, uint64_t Size, const TargetTransformInfo::MemCmpExpansionOptions &Options, - unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - unsigned MaxLoadsPerBlockForZeroCmp, const DataLayout &TheDataLayout); + const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout); unsigned getNumBlocks(); uint64_t getNumLoads() const { return LoadSequence.size(); } @@ -195,16 +192,10 @@ MemCmpExpansion::MemCmpExpansion( CallInst *const CI, uint64_t Size, const TargetTransformInfo::MemCmpExpansionOptions &Options, - const unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - const unsigned MaxLoadsPerBlockForZeroCmp, const DataLayout &TheDataLayout) - : CI(CI), - Size(Size), - MaxLoadSize(0), - NumLoadsNonOneByte(0), - NumLoadsPerBlockForZeroCmp(MaxLoadsPerBlockForZeroCmp), - IsUsedForZeroCmp(IsUsedForZeroCmp), - DL(TheDataLayout), - Builder(CI) { + const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout) + : CI(CI), Size(Size), MaxLoadSize(0), NumLoadsNonOneByte(0), + NumLoadsPerBlockForZeroCmp(Options.NumLoadsPerBlock), + IsUsedForZeroCmp(IsUsedForZeroCmp), DL(TheDataLayout), Builder(CI) { assert(Size > 0 && "zero blocks"); // Scale the max size down if the target can load more bytes than we need. llvm::ArrayRef LoadSizes(Options.LoadSizes); @@ -215,17 +206,17 @@ MaxLoadSize = LoadSizes.front(); // Compute the decomposition. unsigned GreedyNumLoadsNonOneByte = 0; - LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, MaxNumLoads, + LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, Options.MaxNumLoads, GreedyNumLoadsNonOneByte); NumLoadsNonOneByte = GreedyNumLoadsNonOneByte; - assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); + assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant"); // If we allow overlapping loads and the load sequence is not already optimal, // use overlapping loads. 
if (Options.AllowOverlappingLoads && (LoadSequence.empty() || LoadSequence.size() > 2)) { unsigned OverlappingNumLoadsNonOneByte = 0; auto OverlappingLoads = computeOverlappingLoadSequence( - Size, MaxLoadSize, MaxNumLoads, OverlappingNumLoadsNonOneByte); + Size, MaxLoadSize, Options.MaxNumLoads, OverlappingNumLoadsNonOneByte); if (!OverlappingLoads.empty() && (LoadSequence.empty() || OverlappingLoads.size() < LoadSequence.size())) { @@ -233,7 +224,7 @@ NumLoadsNonOneByte = OverlappingNumLoadsNonOneByte; } } - assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); + assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant"); } unsigned MemCmpExpansion::getNumBlocks() { @@ -616,7 +607,8 @@ // calculate which source was larger. The calculation requires the // two loaded source values of each load compare block. // These will be saved in the phi nodes created by setupResultBlockPHINodes. - if (!IsUsedForZeroCmp) setupResultBlockPHINodes(); + if (!IsUsedForZeroCmp) + setupResultBlockPHINodes(); // Create the number of required load compare basic blocks. createLoadCmpBlocks(); @@ -717,7 +709,7 @@ /// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] /// ret i32 %phi.res static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, - const TargetLowering *TLI, const DataLayout *DL) { + const DataLayout *DL) { NumMemCmpCalls++; // Early exit from expansion if -Oz. @@ -738,18 +730,15 @@ // TTI call to check if target would like to expand memcmp. Also, get the // available load sizes. 
const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); - const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp); - if (!Options) return false; - - const unsigned MaxNumLoads = - TLI->getMaxExpandSizeMemcmp(CI->getFunction()->hasOptSize()); + auto Options = TTI->enableMemCmpExpansion(CI->getFunction()->hasOptSize(), + IsUsedForZeroCmp); + if (!Options) + return false; - unsigned NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences() - ? MemCmpEqZeroNumLoadsPerBlock - : TLI->getMemcmpEqZeroLoadsPerBlock(); + if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences()) + Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock; - MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads, - IsUsedForZeroCmp, NumLoadsPerBlock, *DL); + MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL); // Don't expand if this will require more loads than desired by the target. if (Expansion.getNumLoads() == 0) { @@ -768,8 +757,6 @@ return true; } - - class ExpandMemCmpPass : public FunctionPass { public: static char ID; @@ -779,20 +766,14 @@ } bool runOnFunction(Function &F) override { - if (skipFunction(F)) return false; - - auto *TPC = getAnalysisIfAvailable(); - if (!TPC) { + if (skipFunction(F)) return false; - } - const TargetLowering* TL = - TPC->getTM().getSubtargetImpl(F)->getTargetLowering(); const TargetLibraryInfo *TLI = &getAnalysis().getTLI(); const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); - auto PA = runImpl(F, TLI, TTI, TL); + auto PA = runImpl(F, TLI, TTI); return !PA.areAllPreserved(); } @@ -800,23 +781,21 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addPreserved(); FunctionPass::getAnalysisUsage(AU); } PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, - const TargetLowering* TL); + const TargetTransformInfo *TTI); // Returns true if a change was made. 
bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, const TargetLowering* TL, - const DataLayout& DL); + const TargetTransformInfo *TTI, const DataLayout &DL); }; -bool ExpandMemCmpPass::runOnBlock( - BasicBlock &BB, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, const TargetLowering* TL, - const DataLayout& DL) { - for (Instruction& I : BB) { +bool ExpandMemCmpPass::runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, + const DataLayout &DL) { + for (Instruction &I : BB) { CallInst *CI = dyn_cast(&I); if (!CI) { continue; @@ -824,21 +803,20 @@ LibFunc Func; if (TLI->getLibFunc(ImmutableCallSite(CI), Func) && (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && - expandMemCmp(CI, TTI, TL, &DL)) { + expandMemCmp(CI, TTI, &DL)) { return true; } } return false; } - -PreservedAnalyses ExpandMemCmpPass::runImpl( - Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const TargetLowering* TL) { - const DataLayout& DL = F.getParent()->getDataLayout(); +PreservedAnalyses ExpandMemCmpPass::runImpl(Function &F, + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI) { + const DataLayout &DL = F.getParent()->getDataLayout(); bool MadeChanges = false; for (auto BBIt = F.begin(); BBIt != F.end();) { - if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) { + if (runOnBlock(*BBIt, TLI, TTI, DL)) { MadeChanges = true; // If changes were made, restart the function from the beginning, since // the structure of the function was changed. @@ -847,7 +825,11 @@ ++BBIt; } } - return MadeChanges ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); + if (!MadeChanges) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve(); + return PA; } } // namespace @@ -860,6 +842,4 @@ INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp", "Expand memcmp() to load/stores", false, false) -FunctionPass *llvm::createExpandMemCmpPass() { - return new ExpandMemCmpPass(); -} +Pass *llvm::createExpandMemCmpPass() { return new ExpandMemCmpPass(); } diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp --- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -42,6 +42,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" @@ -824,6 +825,7 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addPreserved(); } PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, @@ -837,7 +839,8 @@ // We only try merging comparisons if the target wants to expand memcmp later. // The rationale is to avoid turning small chains into memcmp calls. - if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all(); + if (!TTI->enableMemCmpExpansion(F.hasOptSize(), /*IsZeroCmp*/ true)) + return PreservedAnalyses::all(); // If we don't have memcmp avaiable we can't emit calls to it. 
if (!TLI->has(LibFunc_memcmp)) @@ -851,8 +854,10 @@ MadeChange |= processPhi(*Phi, TLI, AA); } - if (MadeChange) return PreservedAnalyses::none(); - return PreservedAnalyses::all(); + if (!MadeChange) return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve(); + return PA; } } // namespace diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -83,6 +83,7 @@ initializeLowerWidenableConditionLegacyPassPass(Registry); initializeMemCpyOptLegacyPassPass(Registry); initializeMergeICmpsPass(Registry); + initializeExpandMemCmpPassPass(Registry); initializeMergedLoadStoreMotionLegacyPassPass(Registry); initializeNaryReassociateLegacyPassPass(Registry); initializePartiallyInlineLibCallsLegacyPassPass(Registry); diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -32,10 +32,6 @@ ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Induction Variable Users ; CHECK-NEXT: Loop Strength Reduction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Merge contiguous icmps into a memcmp -; CHECK-NEXT: Expand memcmp() to load/stores ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering ; CHECK-NEXT: Remove unreachable blocks from the CFG diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -16,10 +16,6 @@ ; CHECK: Loop Pass Manager ; CHECK: Induction Variable Users ; CHECK: Loop Strength Reduction -; CHECK: Basic Alias Analysis (stateless AA impl) -; CHECK: Function Alias Analysis Results -; CHECK: Merge contiguous icmps into a memcmp -; CHECK: Expand memcmp() to load/stores ; CHECK: 
Lower Garbage Collection Instructions ; CHECK: Shadow Stack GC Lowering ; CHECK: Remove unreachable blocks from the CFG diff --git a/llvm/test/CodeGen/Generic/llc-start-stop.ll b/llvm/test/CodeGen/Generic/llc-start-stop.ll --- a/llvm/test/CodeGen/Generic/llc-start-stop.ll +++ b/llvm/test/CodeGen/Generic/llc-start-stop.ll @@ -13,15 +13,15 @@ ; STOP-BEFORE-NOT: Loop Strength Reduction ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER -; START-AFTER: -aa -mergeicmps +; START-AFTER: -gc-lowering ; START-AFTER: FunctionPass Manager -; START-AFTER-NEXT: Dominator Tree Construction +; START-AFTER-NEXT: Lower Garbage Collection Instructions ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE ; START-BEFORE: -machine-branch-prob -domtree ; START-BEFORE: FunctionPass Manager ; START-BEFORE: Loop Strength Reduction -; START-BEFORE-NEXT: Basic Alias Analysis (stateless AA impl) +; START-BEFORE-NEXT: Lower Garbage Collection Instructions ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE diff --git a/llvm/test/CodeGen/PowerPC/memcmp-mergeexpand.ll b/llvm/test/CodeGen/PowerPC/memcmp-mergeexpand.ll deleted file mode 100644 --- a/llvm/test/CodeGen/PowerPC/memcmp-mergeexpand.ll +++ /dev/null @@ -1,40 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s -check-prefix=PPC64LE - -; This tests interaction between MergeICmp and ExpandMemCmp. 
- -%"struct.std::pair" = type { i32, i32 } - -define zeroext i1 @opeq1( -; PPC64LE-LABEL: opeq1: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: ld 3, 0(3) -; PPC64LE-NEXT: ld 4, 0(4) -; PPC64LE-NEXT: xor 3, 3, 4 -; PPC64LE-NEXT: cntlzd 3, 3 -; PPC64LE-NEXT: rldicl 3, 3, 58, 63 -; PPC64LE-NEXT: blr - %"struct.std::pair"* nocapture readonly dereferenceable(8) %a, - %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 { -entry: - %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0 - %0 = load i32, i32* %first.i, align 4 - %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0 - %1 = load i32, i32* %first1.i, align 4 - %cmp.i = icmp eq i32 %0, %1 - br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit - -land.rhs.i: - %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 - %2 = load i32, i32* %second.i, align 4 - %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 - %3 = load i32, i32* %second2.i, align 4 - %cmp3.i = icmp eq i32 %2, %3 - br label %opeq1.exit - -opeq1.exit: - %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ] - ret i1 %4 -} - - diff --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll --- a/llvm/test/CodeGen/X86/O3-pipeline.ll +++ b/llvm/test/CodeGen/X86/O3-pipeline.ll @@ -29,10 +29,6 @@ ; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Induction Variable Users ; CHECK-NEXT: Loop Strength Reduction -; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) -; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Merge contiguous icmps into a memcmp -; CHECK-NEXT: Expand memcmp() to load/stores ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering ; CHECK-NEXT: Remove unreachable blocks from the CFG diff --git a/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll 
b/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll deleted file mode 100644 --- a/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll +++ /dev/null @@ -1,51 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 - -; This tests interaction between MergeICmp and ExpandMemCmp. - -%"struct.std::pair" = type { i32, i32 } - -define zeroext i1 @opeq1( -; X86-LABEL: opeq1: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: sete %al -; X86-NEXT: retl -; -; X64-LABEL: opeq1: -; X64: # %bb.0: # %entry -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: sete %al -; X64-NEXT: retq - %"struct.std::pair"* nocapture readonly dereferenceable(8) %a, - %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 { -entry: - %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0 - %0 = load i32, i32* %first.i, align 4 - %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0 - %1 = load i32, i32* %first1.i, align 4 - %cmp.i = icmp eq i32 %0, %1 - br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit - -land.rhs.i: - %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 - %2 = load i32, i32* %second.i, align 4 - %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 - %3 = load i32, i32* %second2.i, align 4 - %cmp3.i = icmp eq i32 %2, %3 - br label %opeq1.exit - -opeq1.exit: - %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ] - ret i1 %4 -} - - diff --git 
a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll deleted file mode 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ /dev/null @@ -1,1013 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 - -; This tests codegen time inlining/optimization of memcmp -; rdar://6480398 - -@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 - -declare i32 @memcmp(i8*, i8*, i64) -declare i32 @bcmp(i8*, i8*, i64) - -define i32 @length2(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length2: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %dx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: retl -; -; X64-LABEL: length2: -; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzwl (%rsi), %ecx -; X64-NEXT: rolw $8, %ax -; X64-NEXT: rolw $8, %cx -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: movzwl %cx, %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind - ret i32 %m -} - -define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length2_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: cmpw (%eax), %cx -; 
X86-NEXT: sete %al -; X86-NEXT: retl -; -; X64-LABEL: length2_eq: -; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: sete %al -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind - %c = icmp eq i32 %m, 0 - ret i1 %c -} - -define i1 @length2_eq_const(i8* %X) nounwind optsize { -; X86-LABEL: length2_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 -; X86-NEXT: setne %al -; X86-NEXT: retl -; -; X64-LABEL: length2_eq_const: -; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 -; X64-NEXT: setne %al -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind - %c = icmp ne i32 %m, 0 - ret i1 %c -} - -define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length2_eq_nobuiltin_attr: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $2 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; -; X64-LABEL: length2_eq_nobuiltin_attr: -; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $2, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax -; X64-NEXT: sete %al -; X64-NEXT: popq %rcx -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin - %c = icmp eq i32 %m, 0 - ret i1 %c -} - -define i32 @length3(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length3: -; X86: # %bb.0: # %loadbb -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: movzwl (%ecx), %esi -; X86-NEXT: rolw $8, %dx -; X86-NEXT: rolw $8, %si -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: jne .LBB4_1 -; 
X86-NEXT: # %bb.2: # %loadbb1 -; X86-NEXT: movzbl 2(%eax), %eax -; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB4_3 -; X86-NEXT: .LBB4_1: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB4_3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; -; X64-LABEL: length3: -; X64: # %bb.0: # %loadbb -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzwl (%rsi), %ecx -; X64-NEXT: rolw $8, %ax -; X64-NEXT: rolw $8, %cx -; X64-NEXT: cmpw %cx, %ax -; X64-NEXT: jne .LBB4_1 -; X64-NEXT: # %bb.2: # %loadbb1 -; X64-NEXT: movzbl 2(%rdi), %eax -; X64-NEXT: movzbl 2(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB4_1: # %res_block -; X64-NEXT: setae %al -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind - ret i32 %m -} - -define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length3_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %edx -; X86-NEXT: xorw (%eax), %dx -; X86-NEXT: movb 2(%ecx), %cl -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orw %dx, %ax -; X86-NEXT: setne %al -; X86-NEXT: retl -; -; X64-LABEL: length3_eq: -; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: xorw (%rsi), %ax -; X64-NEXT: movb 2(%rdi), %cl -; X64-NEXT: xorb 2(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orw %ax, %cx -; X64-NEXT: setne %al -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind - %c = icmp ne i32 %m, 0 - ret i1 %c -} - -define i32 @length4(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length4: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; 
X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: retl -; -; X64-LABEL: length4: -; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx -; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind - ret i32 %m -} - -define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length4_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: cmpl (%eax), %ecx -; X86-NEXT: setne %al -; X86-NEXT: retl -; -; X64-LABEL: length4_eq: -; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: setne %al -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind - %c = icmp ne i32 %m, 0 - ret i1 %c -} - -define i1 @length4_eq_const(i8* %X) nounwind optsize { -; X86-LABEL: length4_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 -; X86-NEXT: sete %al -; X86-NEXT: retl -; -; X64-LABEL: length4_eq_const: -; X64: # %bb.0: -; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 -; X64-NEXT: sete %al -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind - %c = icmp eq i32 %m, 0 - ret i1 %c -} - -define i32 @length5(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length5: -; X86: # %bb.0: # %loadbb -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl (%ecx), %esi -; X86-NEXT: bswapl %edx -; X86-NEXT: bswapl %esi -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB9_1 -; X86-NEXT: # 
%bb.2: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB9_3 -; X86-NEXT: .LBB9_1: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB9_3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; -; X64-LABEL: length5: -; X64: # %bb.0: # %loadbb -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: movl (%rsi), %ecx -; X64-NEXT: bswapl %eax -; X64-NEXT: bswapl %ecx -; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: jne .LBB9_1 -; X64-NEXT: # %bb.2: # %loadbb1 -; X64-NEXT: movzbl 4(%rdi), %eax -; X64-NEXT: movzbl 4(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB9_1: # %res_block -; X64-NEXT: setae %al -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind - ret i32 %m -} - -define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length5_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: movb 4(%ecx), %cl -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; -; X64-LABEL: length5_eq: -; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: xorl (%rsi), %eax -; X64-NEXT: movb 4(%rdi), %cl -; X64-NEXT: xorb 4(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orl %eax, %ecx -; X64-NEXT: setne %al -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind - %c = icmp ne i32 %m, 0 - ret i1 %c -} - -define i32 @length8(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length8: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx 
-; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB11_3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; -; X64-LABEL: length8: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx -; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind - ret i32 %m -} - -define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length8_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: sete %al -; X86-NEXT: retl -; -; X64-LABEL: length8_eq: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: sete %al -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind - %c = icmp eq i32 %m, 0 - ret i1 %c -} - -define i1 @length8_eq_const(i8* %X) nounwind optsize { -; X86-LABEL: length8_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 -; X86-NEXT: xorl (%eax), %ecx -; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 -; X86-NEXT: xorl 4(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: setne %al -; X86-NEXT: retl -; -; X64-LABEL: length8_eq_const: -; X64: # %bb.0: -; X64-NEXT: movabsq 
$3978425819141910832, %rax # imm = 0x3736353433323130 -; X64-NEXT: cmpq %rax, (%rdi) -; X64-NEXT: setne %al -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind - %c = icmp ne i32 %m, 0 - ret i1 %c -} - -define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length12_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; -; X64-LABEL: length12_eq: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: xorq (%rsi), %rax -; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl 8(%rsi), %ecx -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: setne %al -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind - %c = icmp ne i32 %m, 0 - ret i1 %c -} - -define i32 @length12(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length12: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; -; X64-LABEL: length12: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx -; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_2 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: movl 8(%rsi), %edx -; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB15_3 -; X64-NEXT: .LBB15_2: # %res_block -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB15_3: # %endblock -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind - ret i32 %m -} - 
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 - -define i32 @length16(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length16: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; -; X64-LABEL: length16: -; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx -; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_2 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx -; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB16_3 -; X64-NEXT: .LBB16_2: # %res_block -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB16_3: # %endblock -; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind - ret i32 %m -} - -define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize { -; X86-NOSSE-LABEL: length16_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; 
X64-SSE2-NEXT: movdqu (%rsi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length16_eq: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: retq - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind - %cmp = icmp ne i32 %call, 0 - ret i1 %cmp -} - -define i1 @length16_eq_const(i8* %X) nounwind optsize { -; X86-NOSSE-LABEL: length16_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq_const: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length16_eq_const: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* getelementptr 
inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind - %c = icmp eq i32 %m, 0 - ret i1 %c -} - -; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 - -define i32 @length24(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length24: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; -; X64-LABEL: length24: -; X64: # %bb.0: -; X64-NEXT: movl $24, %edx -; X64-NEXT: jmp memcmp # TAILCALL - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 24) nounwind - ret i32 %m -} - -define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { -; X86-NOSSE-LABEL: length24_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pmovmskb 
%xmm2, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length24_eq: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X64-AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: retq - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind - %cmp = icmp eq i32 %call, 0 - ret i1 %cmp -} - -define i1 @length24_eq_const(i8* %X) nounwind optsize { -; X86-NOSSE-LABEL: length24_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: retq -; -; 
X64-AVX2-LABEL: length24_eq_const: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm1, %xmm1 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind - %c = icmp ne i32 %m, 0 - ret i1 %c -} - -define i32 @length32(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length32: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; -; X64-LABEL: length32: -; X64: # %bb.0: -; X64-NEXT: movl $32, %edx -; X64-NEXT: jmp memcmp # TAILCALL - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind - ret i32 %m -} - -; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 - -define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { -; X86-NOSSE-LABEL: length32_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: 
pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 -; X64-SSE2-NEXT: movdqu (%rsi), %xmm2 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0 -; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind - %cmp = icmp eq i32 %call, 0 - ret i1 %cmp -} - -define i1 @length32_eq_const(i8* %X) nounwind optsize { -; X86-NOSSE-LABEL: length32_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 
16(%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq_const: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind - %c = icmp ne i32 %m, 0 - ret i1 %c -} - -define i32 @length64(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length64: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; -; X64-LABEL: length64: -; X64: # %bb.0: -; X64-NEXT: movl $64, %edx -; X64-NEXT: jmp memcmp # TAILCALL - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind - ret i32 %m -} - -define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { -; X86-LABEL: length64_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; -; X64-SSE2-LABEL: length64_eq: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $64, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; 
X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq - %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind - %cmp = icmp ne i32 %call, 0 - ret i1 %cmp -} - -define i1 @length64_eq_const(i8* %X) nounwind optsize { -; X86-LABEL: length64_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; -; X64-SSE2-LABEL: length64_eq_const: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $.L.str, %esi -; X64-SSE2-NEXT: movl $64, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length64_eq_const: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind - %c = icmp eq i32 %m, 0 - ret i1 %c -} - -define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: bcmp_length2: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %dx -; X86-NEXT: movzwl %cx, 
%eax -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: retl -; -; X64-LABEL: bcmp_length2: -; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzwl (%rsi), %ecx -; X64-NEXT: rolw $8, %ax -; X64-NEXT: rolw $8, %cx -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: movzwl %cx, %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq - %m = tail call i32 @bcmp(i8* %X, i8* %Y, i64 2) nounwind - ret i32 %m -} - diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -1,10 +1,10 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X86 --check-prefix=SSE --check-prefix=X86-SSE1 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=SSE --check-prefix=X86-SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 +; RUN: opt -O3 -mtriple=i686-unknown-unknown -mattr=cmov < %s | llc -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: opt -O3 -mtriple=i686-unknown-unknown -mattr=+sse < %s | llc -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X86 --check-prefix=SSE --check-prefix=X86-SSE1 +; RUN: opt -O3 -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | llc -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=SSE 
--check-prefix=X86-SSE2 +; RUN: opt -O3 -mtriple=x86_64-unknown-unknown < %s | llc -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: opt -Os -mtriple=x86_64-unknown-unknown < %s | llc -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-OPTSIZE +; RUN: opt -O3 -mtriple=x86_64-unknown-unknown -mattr=avx < %s | llc -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: opt -O3 -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | llc -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 ; This tests codegen time inlining/optimization of memcmp ; rdar://6480398 @@ -12,6 +12,7 @@ @.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 declare i32 @memcmp(i8*, i8*, i64) +declare i32 @bcmp(i8*, i8*, i64) define i32 @length0(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length0: @@ -112,15 +113,11 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%eax), %eax ; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %dx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl +; X86-NEXT: rolw $8, %ax +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: setb %al ; ; X64-LABEL: length2_lt: ; X64: # %bb.0: @@ -128,11 +125,8 @@ ; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %ax ; X64-NEXT: rolw $8, %cx -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: movzwl %cx, %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: shrl $31, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: cmpw %cx, %ax +; X64-NEXT: setb %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, 
i64 2) nounwind %c = icmp slt i32 %m, 0 @@ -148,12 +142,8 @@ ; X86-NEXT: movzwl (%eax), %eax ; X86-NEXT: rolw $8, %cx ; X86-NEXT: rolw $8, %ax -; X86-NEXT: movzwl %cx, %ecx -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: setg %al -; X86-NEXT: retl +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: seta %al ; ; X64-LABEL: length2_gt: ; X64: # %bb.0: @@ -161,11 +151,8 @@ ; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %ax ; X64-NEXT: rolw $8, %cx -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: movzwl %cx, %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setg %al +; X64-NEXT: cmpw %cx, %ax +; X64-NEXT: seta %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind %c = icmp sgt i32 %m, 0 @@ -223,34 +210,33 @@ ; X86-LABEL: length3: ; X86: # %bb.0: # %loadbb ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: movzwl (%ecx), %esi -; X86-NEXT: rolw $8, %dx -; X86-NEXT: rolw $8, %si -; X86-NEXT: cmpw %si, %dx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movzwl (%esi), %eax +; X86-NEXT: movzwl (%edx), %ecx +; X86-NEXT: cmpw %cx, %ax ; X86-NEXT: jne .LBB9_1 ; X86-NEXT: # %bb.2: # %loadbb1 -; X86-NEXT: movzbl 2(%eax), %eax -; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: movzbl 2(%esi), %eax +; X86-NEXT: movzbl 2(%edx), %ecx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl ; X86-NEXT: .LBB9_1: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: popl %esi +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %ax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length3: ; X64: # %bb.0: # %loadbb ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: movzwl 
(%rsi), %ecx -; X64-NEXT: rolw $8, %ax -; X64-NEXT: rolw $8, %cx -; X64-NEXT: cmpw %cx, %ax +; X64-NEXT: cmpw %cx, %ax ; X64-NEXT: jne .LBB9_1 ; X64-NEXT: # %bb.2: # %loadbb1 ; X64-NEXT: movzbl 2(%rdi), %eax @@ -258,9 +244,12 @@ ; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq ; X64-NEXT: .LBB9_1: # %res_block -; X64-NEXT: setae %al -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpw %cx, %ax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind ret i32 %m @@ -352,29 +341,21 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: bswapl %eax +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: length4_lt: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: seta %al -; X64-NEXT: sbbl $0, %eax -; X64-NEXT: shrl $31, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: setb %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind %c = icmp slt i32 %m, 0 @@ -390,12 +371,8 @@ ; X86-NEXT: movl (%eax), %eax ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %eax -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpl %eax, %ecx -; X86-NEXT: seta %dl -; X86-NEXT: sbbl $0, %edx -; X86-NEXT: testl %edx, %edx -; 
X86-NEXT: setg %al +; X86-NEXT: seta %al ; X86-NEXT: retl ; ; X64-LABEL: length4_gt: @@ -404,12 +381,8 @@ ; X64-NEXT: movl (%rsi), %ecx ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: seta %dl -; X64-NEXT: sbbl $0, %edx -; X64-NEXT: testl %edx, %edx -; X64-NEXT: setg %al +; X64-NEXT: seta %al ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind %c = icmp sgt i32 %m, 0 @@ -438,24 +411,25 @@ ; X86-LABEL: length5: ; X86: # %bb.0: # %loadbb ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl (%ecx), %esi -; X86-NEXT: bswapl %edx -; X86-NEXT: bswapl %esi -; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: jne .LBB16_1 ; X86-NEXT: # %bb.2: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: movzbl 4(%esi), %eax +; X86-NEXT: movzbl 4(%edx), %ecx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl ; X86-NEXT: .LBB16_1: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -463,8 +437,6 @@ ; X64: # %bb.0: # %loadbb ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: movl (%rsi), %ecx -; X64-NEXT: bswapl %eax -; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: jne .LBB16_1 ; X64-NEXT: # %bb.2: # %loadbb1 @@ -473,9 +445,12 @@ ; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq ; X64-NEXT: .LBB16_1: # %res_block -; X64-NEXT: setae %al -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: 
bswapl %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind ret i32 %m @@ -514,23 +489,24 @@ ; X86-LABEL: length5_lt: ; X86: # %bb.0: # %loadbb ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl (%ecx), %esi -; X86-NEXT: bswapl %edx -; X86-NEXT: bswapl %esi -; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: jne .LBB18_1 ; X86-NEXT: # %bb.2: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: movzbl 4(%esi), %eax +; X86-NEXT: movzbl 4(%edx), %ecx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: jmp .LBB18_3 ; X86-NEXT: .LBB18_1: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax ; X86-NEXT: .LBB18_3: # %endblock ; X86-NEXT: shrl $31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax @@ -541,8 +517,6 @@ ; X64: # %bb.0: # %loadbb ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: movl (%rsi), %ecx -; X64-NEXT: bswapl %eax -; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: jne .LBB18_1 ; X64-NEXT: # %bb.2: # %loadbb1 @@ -550,12 +524,15 @@ ; X64-NEXT: movzbl 4(%rsi), %ecx ; X64-NEXT: subl %ecx, %eax ; X64-NEXT: shrl $31, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; X64-NEXT: .LBB18_1: # %res_block -; X64-NEXT: setae %al -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: 
bswapl %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -599,25 +576,25 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl (%esi), %ecx ; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB20_2 -; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: jne .LBB20_1 +; X86-NEXT: # %bb.2: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB20_3 -; X86-NEXT: .LBB20_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB20_3: # %endblock -; X86-NEXT: popl %esi +; X86-NEXT: jne .LBB20_1 +; X86-NEXT: # %bb.3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB20_1: +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length8: @@ -807,23 +784,26 @@ ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx -; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB27_2 -; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: jne .LBB27_1 +; X64-NEXT: # %bb.4: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx -; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: je .LBB27_3 +; X64-NEXT: # %bb.5: +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: jmp .LBB27_2 +; X64-NEXT: .LBB27_1: +; X64-NEXT: bswapq %rcx +; 
X64-NEXT: bswapq %rdx ; X64-NEXT: .LBB27_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: setae %al +; X64-NEXT: leal -1(%rax,%rax), %eax ; X64-NEXT: .LBB27_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind @@ -928,25 +908,23 @@ ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx -; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB31_2 -; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: jne .LBB31_1 +; X64-NEXT: # %bb.2: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx -; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB31_3 -; X64-NEXT: .LBB31_2: # %res_block +; X64-NEXT: jne .LBB31_1 +; X64-NEXT: # %bb.3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB31_1: +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB31_3: # %endblock -; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m } @@ -1130,32 +1108,19 @@ ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pmovmskb %xmm2, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq +; X64-LABEL: length24_eq: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: xorq 8(%rsi), %rcx +; X64-NEXT: xorq (%rsi), %rax +; 
X64-NEXT: movq 16(%rdi), %rdx +; X64-NEXT: xorq 16(%rsi), %rdx +; X64-NEXT: orq %rax, %rdx +; X64-NEXT: orq %rcx, %rdx +; X64-NEXT: sete %al +; X64-NEXT: retq ; -; X64-AVX-LABEL: length24_eq: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X64-AVX-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: sete %al -; X64-AVX-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind %cmp = icmp eq i32 %call, 0 ret i1 %cmp @@ -1199,29 +1164,19 @@ ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: retq +; X64-LABEL: length24_eq_const: +; X64: # %bb.0: +; X64-NEXT: movabsq $3833745473465760056, %rax # imm = 0x3534333231303938 +; X64-NEXT: xorq 8(%rdi), %rax +; X64-NEXT: movabsq $3978425819141910832, %rcx # imm = 0x3736353433323130 +; X64-NEXT: xorq (%rdi), %rcx +; X64-NEXT: movabsq $3689065127958034230, %rdx # imm = 0x3332313039383736 +; X64-NEXT: xorq 16(%rdi), %rdx +; X64-NEXT: orq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx +; X64-NEXT: setne %al +; X64-NEXT: retq ; -; X64-AVX-LABEL: length24_eq_const: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm1, %xmm1 -; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; 
X64-AVX-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: setne %al -; X64-AVX-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind %c = icmp ne i32 %m, 0 ret i1 %c @@ -1307,9 +1262,9 @@ ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1 ; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1 +; X64-AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-AVX1-NEXT: sete %al @@ -1513,9 +1468,9 @@ ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm1, %ymm1 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax ; X64-AVX2-NEXT: sete %al @@ -1568,7 +1523,6 @@ ; X64-NEXT: sete %al ; X64-NEXT: popq %rcx ; X64-NEXT: retq - %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind %c = icmp eq i32 %m, 0 ret i1 %c @@ -1612,3 +1566,25 @@ %c = icmp eq i32 %m, 0 ret i1 %c } + +define i1 @bcmp_length2(i8* %X, i8* %Y) nounwind { +; X86-LABEL: bcmp_length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: bcmp_length2: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @bcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 
%m, 0 + ret i1 %c +} + diff --git a/llvm/test/CodeGen/X86/pr36421.ll b/llvm/test/CodeGen/X86/pr36421.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr36421.ll @@ -0,0 +1,65 @@ +; RUN: opt -O3 -extra-vectorizer-passes -mtriple=x86_64-unknown-unknown < %s | llc -mtriple=x86_64-unknown-unknown | FileCheck %s + +@.str = private unnamed_addr constant [7 x i8] c"abcdef\00", align 1 +@.str.1 = private unnamed_addr constant [7 x i8] c"ABCDEF\00", align 1 + +define i32 @test(i8* nocapture readonly %string, i32 %len) local_unnamed_addr #0 { +; CHECK-LABEL: test: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpl $6, %esi +; CHECK-NEXT: jne .LBB0_4 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: movl (%rdi), %ecx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: xorl $1684234849, %eax # imm = 0x64636261 +; CHECK-NEXT: movzwl 4(%rdi), %edx +; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: xorl $26213, %esi +; CHECK-NEXT: orl %eax, %esi +; CHECK-NEXT: je .LBB0_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: xorl $1145258561, %ecx +; CHECK-NEXT: xorl $17989, %edx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: sete %al +; CHECK-NEXT: shll $6, %eax +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: movl $61, %eax +; CHECK-NEXT: retq +; +entry: + %cond = icmp eq i32 %len, 6 + br i1 %cond, label %sw.bb, label %return + +sw.bb: ; preds = %entry + %call = tail call i32 @memcmp(i8* %string, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i64 0, i64 0), i64 6) + %cmp = icmp eq i32 %call, 0 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %sw.bb + %call1 = tail call i32 @memcmp(i8* %string, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.1, i64 0, i64 0), i64 6) + %cmp2 = icmp eq i32 %call1, 0 + %.
= select i1 %cmp2, i32 64, i32 0 + br label %return + +return: ; preds = %entry, %if.end8, %if.end4, %if.end, %sw.bb + %retval.0 = phi i32 [ 61, %sw.bb ], [ %., %if.end ], [ 0, %entry ] + ret i32 %retval.0 +} + +; Function Attrs: nounwind readonly +declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1 + +attributes #0 = { nounwind readonly ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 7.0.0 (trunk 325350)"} diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -135,7 +135,12 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: MemCpy Optimization +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Merge contiguous icmps into a memcmp +; CHECK-NEXT: Expand memcmp() to load/stores ; 
CHECK-NEXT: Sparse Conditional Constant Propagation +; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Demanded bits analysis ; CHECK-NEXT: Bit-Tracking Dead Code Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -140,7 +140,12 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: MemCpy Optimization +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Merge contiguous icmps into a memcmp +; CHECK-NEXT: Expand memcmp() to load/stores ; CHECK-NEXT: Sparse Conditional Constant Propagation +; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Demanded bits analysis ; CHECK-NEXT: Bit-Tracking Dead Code Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -122,7 +122,12 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: MemCpy Optimization +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Merge contiguous icmps into a memcmp +; CHECK-NEXT: Expand memcmp() to load/stores ; CHECK-NEXT: Sparse Conditional Constant Propagation +; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Demanded bits analysis ; CHECK-NEXT: Bit-Tracking Dead Code Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) diff --git a/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp-mergeexpand.ll b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp-mergeexpand.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp-mergeexpand.ll @@ -0,0 +1,50 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mergeicmps -expandmemcmp -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux < %s | FileCheck %s --check-prefix=PPC64LE + +; This tests interaction between MergeICmp and ExpandMemCmp. + +%"struct.std::pair" = type { i32, i32 } + +define zeroext i1 @opeq1( +; PPC64LE-LABEL: @opeq1( +; PPC64LE-NEXT: entry: +; PPC64LE-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 0 +; PPC64LE-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0 +; PPC64LE-NEXT: [[CSTR:%.*]] = bitcast i32* [[FIRST_I]] to i8* +; PPC64LE-NEXT: [[CSTR1:%.*]] = bitcast i32* [[FIRST1_I]] to i8* +; PPC64LE-NEXT: [[TMP0:%.*]] = bitcast i8* [[CSTR]] to i64* +; PPC64LE-NEXT: [[TMP1:%.*]] = bitcast i8* [[CSTR1]] to i64* +; PPC64LE-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; PPC64LE-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; PPC64LE-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; PPC64LE-NEXT: [[TMP5:%.*]] = zext i1 [[TMP4]] to i32 +; PPC64LE-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; PPC64LE-NEXT: br label [[OPEQ1_EXIT:%.*]] +; PPC64LE: opeq1.exit: +; PPC64LE-NEXT: [[TMP7:%.*]] = phi i1 [ [[TMP6]], [[ENTRY:%.*]] ] +; PPC64LE-NEXT: ret i1 [[TMP7]] +; + %"struct.std::pair"* nocapture readonly dereferenceable(8) %a, + %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 { +entry: + %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0 + %0 = load i32, i32* %first.i, align 4 + %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0 + %1 = load i32, i32* %first1.i, align 4 + %cmp.i = icmp eq i32 %0, %1 + br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit + +land.rhs.i: + %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 + %2 = load i32, i32* 
%second.i, align 4 + %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 + %3 = load i32, i32* %second2.i, align 4 + %cmp3.i = icmp eq i32 %2, %3 + br label %opeq1.exit + +opeq1.exit: + %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ] + ret i1 %4 +} + + diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-mergeexpand.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-mergeexpand.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-mergeexpand.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mergeicmps -expandmemcmp -mtriple=i386-unknown-linux < %s | FileCheck %s --check-prefix=X86 +; RUN: opt -S -mergeicmps -expandmemcmp -mtriple=x86_64-unknown-linux < %s | FileCheck %s --check-prefix=X64 + +; This tests interaction between MergeICmp and ExpandMemCmp. + +%"struct.std::pair" = type { i32, i32 } + +define zeroext i1 @opeq1( +; X86-LABEL: @opeq1( +; X86-NEXT: entry: +; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[FIRST_I]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[FIRST1_I]] to i8* +; X86-NEXT: [[TMP0:%.*]] = bitcast i8* [[CSTR]] to i32* +; X86-NEXT: [[TMP1:%.*]] = bitcast i8* [[CSTR1]] to i32* +; X86-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X86-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X86-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], [[TMP3]] +; X86-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[CSTR]], i8 4 +; X86-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to i32* +; X86-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[CSTR1]], i8 4 +; X86-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32* +; X86-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP6]] +; X86-NEXT: [[TMP10:%.*]] = load 
i32, i32* [[TMP8]] +; X86-NEXT: [[TMP11:%.*]] = xor i32 [[TMP9]], [[TMP10]] +; X86-NEXT: [[TMP12:%.*]] = or i32 [[TMP4]], [[TMP11]] +; X86-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0 +; X86-NEXT: [[TMP14:%.*]] = zext i1 [[TMP13]] to i32 +; X86-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 0 +; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] +; X86: opeq1.exit: +; X86-NEXT: [[TMP16:%.*]] = phi i1 [ [[TMP15]], [[ENTRY:%.*]] ] +; X86-NEXT: ret i1 [[TMP16]] +; +; X64-LABEL: @opeq1( +; X64-NEXT: entry: +; X64-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 0 +; X64-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0 +; X64-NEXT: [[CSTR:%.*]] = bitcast i32* [[FIRST_I]] to i8* +; X64-NEXT: [[CSTR1:%.*]] = bitcast i32* [[FIRST1_I]] to i8* +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[CSTR]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[CSTR1]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: [[TMP5:%.*]] = zext i1 [[TMP4]] to i32 +; X64-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 +; X64-NEXT: br label [[OPEQ1_EXIT:%.*]] +; X64: opeq1.exit: +; X64-NEXT: [[TMP7:%.*]] = phi i1 [ [[TMP6]], [[ENTRY:%.*]] ] +; X64-NEXT: ret i1 [[TMP7]] +; + %"struct.std::pair"* nocapture readonly dereferenceable(8) %a, + %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 { +entry: + %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0 + %0 = load i32, i32* %first.i, align 4 + %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0 + %1 = load i32, i32* %first1.i, align 4 + %cmp.i = icmp eq i32 %0, %1 + br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit + +land.rhs.i: + %second.i = getelementptr inbounds %"struct.std::pair", 
%"struct.std::pair"* %a, i64 0, i32 1 + %2 = load i32, i32* %second.i, align 4 + %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 + %3 = load i32, i32* %second2.i, align 4 + %cmp3.i = icmp eq i32 %2, %3 + br label %opeq1.exit + +opeq1.exit: + %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ] + ret i1 %4 +} + + diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -510,7 +510,6 @@ initializeTarget(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. - initializeExpandMemCmpPassPass(Registry); initializeScalarizeMaskedMemIntrinPass(Registry); initializeCodeGenPreparePass(Registry); initializeAtomicExpandPass(Registry); diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn @@ -40,7 +40,6 @@ "EdgeBundles.cpp", "ExecutionDomainFix.cpp", "ExpandISelPseudos.cpp", - "ExpandMemCmp.cpp", "ExpandPostRAPseudos.cpp", "ExpandReductions.cpp", "FEntryInserter.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn @@ -21,6 +21,7 @@ "DeadStoreElimination.cpp", "DivRemPairs.cpp", "EarlyCSE.cpp", + "ExpandMemCmp.cpp", "FlattenCFGPass.cpp", "Float2Int.cpp", "GVN.cpp",