Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -554,8 +554,14 @@
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
-  /// \brief Enable inline expansion of memcmp
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize) const;
+  /// \brief Options controlling the inline expansion of memcmp.
+  struct MemCmpExpansionOptions {
+    // The list of available load sizes (in bytes), sorted in decreasing order.
+    SmallVector<unsigned, 8> LoadSizes;
+  };
+  /// \brief If not nullptr, enable inline expansion of memcmp. IsZeroCmp is
+  /// true if this is the expansion of memcmp(p1, p2, s) == 0.
+  const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsZeroCmp) const;
 
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
@@ -993,7 +999,8 @@
                                    unsigned VF) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
-  virtual bool enableMemCmpExpansion(unsigned &MaxLoadSize) = 0;
+  virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
+      bool IsZeroCmp) const = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -1246,8 +1253,9 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize) override {
-    return Impl.enableMemCmpExpansion(MaxLoadSize);
+  const MemCmpExpansionOptions *enableMemCmpExpansion(
+      bool IsZeroCmp) const override {
+    return Impl.enableMemCmpExpansion(IsZeroCmp);
   }
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
Index: 
include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -294,7 +294,10 @@
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize) { return false; }
+  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
+      bool IsZeroCmp) const {
+    return nullptr;
+  }
 
   bool enableInterleavedAccessVectorization() { return false; }
 
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -250,8 +250,9 @@
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
-bool TargetTransformInfo::enableMemCmpExpansion(unsigned &MaxLoadSize) const {
-  return TTIImpl->enableMemCmpExpansion(MaxLoadSize);
+const TargetTransformInfo::MemCmpExpansionOptions *
+TargetTransformInfo::enableMemCmpExpansion(bool IsZeroCmp) const {
+  return TTIImpl->enableMemCmpExpansion(IsZeroCmp);
 }
 
 bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -1758,9 +1758,10 @@
   Value *getMemCmpOneBlock();
 
 public:
-  MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize,
-                  unsigned MaxNumLoads, unsigned NumLoadsPerBlock,
-                  const DataLayout &DL);
+  MemCmpExpansion(CallInst *CI, uint64_t Size,
+                  const TargetTransformInfo::MemCmpExpansionOptions &Options,
+                  unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+                  unsigned NumLoadsPerBlock, const DataLayout &DL);
 
   unsigned getNumBlocks();
   uint64_t getNumLoads() const { return LoadSequence.size(); }
@@ -1778,29 +1779,35 @@
 // return from.
 // 3. ResultBlock, block to branch to for early exit when a
 // LoadCmpBlock finds a difference.
-MemCmpExpansion::MemCmpExpansion(CallInst *const CI, uint64_t Size,
-                                 const unsigned MaxLoadSize,
-                                 const unsigned MaxNumLoads,
-                                 const unsigned LoadsPerBlock,
-                                 const DataLayout &TheDataLayout)
+MemCmpExpansion::MemCmpExpansion(
+    CallInst *const CI, uint64_t Size,
+    const TargetTransformInfo::MemCmpExpansionOptions &Options,
+    const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+    const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
     : CI(CI),
       Size(Size),
-      MaxLoadSize(MaxLoadSize),
+      MaxLoadSize(0),
       NumLoadsNonOneByte(0),
-      NumLoadsPerBlock(LoadsPerBlock),
-      IsUsedForZeroCmp(isOnlyUsedInZeroEqualityComparison(CI)),
+      NumLoadsPerBlock(NumLoadsPerBlock),
+      IsUsedForZeroCmp(IsUsedForZeroCmp),
       DL(TheDataLayout),
       Builder(CI) {
   assert(Size > 0 && "zero blocks");
   // Scale the max size down if the target can load more bytes than we need.
-  while (this->MaxLoadSize > Size) {
-    this->MaxLoadSize /= 2;
+  size_t LoadSizeIndex = 0;
+  while (LoadSizeIndex < Options.LoadSizes.size() &&
+         Options.LoadSizes[LoadSizeIndex] > Size) {
+    ++LoadSizeIndex;
   }
+  // None of the available load sizes fits: leave the load sequence empty
+  // (getNumLoads() == 0) instead of indexing past the end of the list.
+  if (LoadSizeIndex == Options.LoadSizes.size()) return;
+  this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
   // Compute the decomposition.
-  unsigned LoadSize = this->MaxLoadSize;
   uint64_t CurSize = Size;
   uint64_t Offset = 0;
-  while (CurSize) {
+  while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
+    const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
     assert(LoadSize > 0 && "zero load size");
     const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
     if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
@@ -1821,11 +1828,7 @@
       }
       CurSize = CurSize % LoadSize;
     }
-    // FIXME: This can result in a non-native load size (e.g. X86-32+SSE can
-    // load 16 and 4 but not 8), which throws the load count off (e.g. in the
-    // aforementioned case, 16 bytes will count for 2 loads but will generate
-    // 4).
-    LoadSize /= 2;
+    ++LoadSizeIndex;
   }
   assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
 }
@@ -2362,15 +2365,16 @@
   }
 
   // TTI call to check if target would like to expand memcmp. Also, get the
-  // max LoadSize.
-  unsigned MaxLoadSize;
-  if (!TTI->enableMemCmpExpansion(MaxLoadSize)) return false;
+  // available load sizes.
+  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+  const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
+  if (!Options) return false;
 
   const unsigned MaxNumLoads =
       TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
 
-  MemCmpExpansion Expansion(CI, SizeVal, MaxLoadSize, MaxNumLoads,
-                            MemCmpNumLoadsPerBlock, *DL);
+  MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
+                            IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
 
   // Don't expand if this will require more loads than desired by the target.
   if (Expansion.getNumLoads() == 0) {
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -63,7 +63,8 @@
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize);
+  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
+      bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector) const;
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -226,9 +226,17 @@
   return LoopHasReductions;
 }
 
-bool PPCTTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) {
-  MaxLoadSize = 8;
-  return true;
+const PPCTTIImpl::TTI::MemCmpExpansionOptions *
+PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
+  static const auto Options = []() {
+    TTI::MemCmpExpansionOptions Options;
+    Options.LoadSizes.push_back(8);
+    Options.LoadSizes.push_back(4);
+    Options.LoadSizes.push_back(2);
+    Options.LoadSizes.push_back(1);
+    return Options;
+  }();
+  return &Options;
 }
 
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -127,7 +127,8 @@
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize);
+  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
+      bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2536,10 +2536,40 @@
   return (CallerBits & CalleeBits) == CalleeBits;
 }
 
-bool X86TTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) {
-  // TODO: We can increase these based on available vector ops.
-  MaxLoadSize = ST->is64Bit() ? 8 : 4;
-  return true;
+const X86TTIImpl::TTI::MemCmpExpansionOptions *
+X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
+  // The available load sizes depend on the subtarget (ST). A function-local
+  // static is initialized only on the first call, so it must not be built
+  // from `this`: otherwise every later caller would get the load sizes of
+  // whichever subtarget happened to ask first. Keep one immutable entry per
+  // feature combination and select the matching entry on every call.
+  const auto MakeOptions = [](unsigned MaxVectorSize, bool Is64Bit) {
+    TTI::MemCmpExpansionOptions Options;
+    // TODO: enable AVX512 when the DAG is ready.
+    // if (MaxVectorSize >= 64) Options.LoadSizes.push_back(64);
+    if (MaxVectorSize >= 32) Options.LoadSizes.push_back(32);
+    if (MaxVectorSize >= 16) Options.LoadSizes.push_back(16);
+    if (Is64Bit) Options.LoadSizes.push_back(8);
+    Options.LoadSizes.push_back(4);
+    Options.LoadSizes.push_back(2);
+    Options.LoadSizes.push_back(1);
+    return Options;
+  };
+  // Indexed by [vector level: 0 = scalar only, 1 = SSE2, 2 = AVX2][is64Bit].
+  static const TTI::MemCmpExpansionOptions OptionsTable[3][2] = {
+      {MakeOptions(0, false), MakeOptions(0, true)},
+      {MakeOptions(16, false), MakeOptions(16, true)},
+      {MakeOptions(32, false), MakeOptions(32, true)}};
+  // Only enable vector loads for equality comparison, as we don't have a
+  // vector bswap.
+  unsigned VectorLevel = 0;
+  if (IsZeroCmp) {
+    if (ST->hasAVX2())
+      VectorLevel = 2;
+    else if (ST->hasSSE2())
+      VectorLevel = 1;
+  }
+  return &OptionsTable[VectorLevel][ST->is64Bit() ? 1 : 0];
 }
 
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
Index: lib/Transforms/Scalar/MergeICmps.cpp
===================================================================
--- lib/Transforms/Scalar/MergeICmps.cpp
+++ lib/Transforms/Scalar/MergeICmps.cpp
@@ -625,8 +625,7 @@
 
   // We only try merging comparisons if the target wants to expand memcmp later.
   // The rationale is to avoid turning small chains into memcmp calls.
-  unsigned MaxLoadSize;
-  if (!TTI->enableMemCmpExpansion(MaxLoadSize)) return PreservedAnalyses::all();
+  if (!TTI->enableMemCmpExpansion(false)) return PreservedAnalyses::all();
 
   bool MadeChange = false;
 
Index: test/CodeGen/X86/memcmp-optsize.ll =================================================================== --- test/CodeGen/X86/memcmp-optsize.ll +++ test/CodeGen/X86/memcmp-optsize.ll @@ -598,22 +598,24 @@ ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl ; -; X64-LABEL: length16_eq: -; X64: # BB#0: # %loadbb -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # BB#2: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq 8(%rsi), %rcx -; X64-NEXT: je .LBB17_3 -; X64-NEXT: .LBB17_1: # %res_block -; X64-NEXT: movl $1, %eax -; X64-NEXT: .LBB17_3: # %endblock -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setne %al -; X64-NEXT: retq +; X64-SSE2-LABEL: length16_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, 
%eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length16_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp @@ -642,22 +644,23 @@ ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: retl ; -; X64-LABEL: length16_eq_const: -; X64: # BB#0: # %loadbb -; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 -; X64-NEXT: cmpq %rax, (%rdi) -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # BB#2: # %loadbb1 -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 -; X64-NEXT: cmpq %rcx, 8(%rdi) -; X64-NEXT: je .LBB18_3 -; X64-NEXT: .LBB18_1: # %res_block -; X64-NEXT: movl $1, %eax -; X64-NEXT: .LBB18_3: # %endblock -; X64-NEXT: testl %eax, %eax -; X64-NEXT: sete %al -; X64-NEXT: retq +; X64-SSE2-LABEL: length16_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length16_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind %c = icmp eq i32 %m, 0 ret i1 %c @@ -697,15 +700,44 @@ ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-LABEL: length24_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $24, %edx -; X64-NEXT: callq 
memcmp -; X64-NEXT: testl %eax, %eax -; X64-NEXT: sete %al -; X64-NEXT: popq %rcx -; X64-NEXT: retq +; X64-SSE2-LABEL: length24_eq: +; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: jne .LBB20_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: movq 16(%rdi), %rcx +; X64-SSE2-NEXT: xorl %eax, %eax +; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx +; X64-SSE2-NEXT: je .LBB20_3 +; X64-SSE2-NEXT: .LBB20_1: # %res_block +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: .LBB20_3: # %endblock +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length24_eq: +; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: jne .LBB20_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: movq 16(%rdi), %rcx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx +; X64-AVX2-NEXT: je .LBB20_3 +; X64-AVX2-NEXT: .LBB20_1: # %res_block +; X64-AVX2-NEXT: movl $1, %eax +; X64-AVX2-NEXT: .LBB20_3: # %endblock +; X64-AVX2-NEXT: testl %eax, %eax +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind %cmp = icmp eq i32 %call, 0 ret i1 %cmp @@ -724,16 +756,43 @@ ; X86-NEXT: setne %al ; X86-NEXT: retl ; -; X64-LABEL: length24_eq_const: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $.L.str, %esi -; X64-NEXT: movl $24, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setne %al -; X64-NEXT: popq %rcx -; X64-NEXT: retq +; X64-SSE2-LABEL: length24_eq_const: +; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: jne .LBB21_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: xorl %eax, %eax +; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 +; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) +; X64-SSE2-NEXT: je .LBB21_3 +; X64-SSE2-NEXT: .LBB21_1: # %res_block +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: .LBB21_3: # %endblock +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length24_eq_const: +; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: jne .LBB21_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 +; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi) +; X64-AVX2-NEXT: je .LBB21_3 +; X64-AVX2-NEXT: .LBB21_1: # %res_block +; X64-AVX2-NEXT: movl $1, %eax +; X64-AVX2-NEXT: .LBB21_3: # %endblock +; X64-AVX2-NEXT: testl %eax, %eax +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind %c = icmp ne i32 %m, 0 ret i1 %c @@ -761,26 +820,65 @@ ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { -; X86-LABEL: length32_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl +; X86-NOSSE-LABEL: length32_eq: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: 
pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq: +; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %edx +; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF +; X86-SSE2-NEXT: jne .LBB23_1 +; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx +; X86-SSE2-NEXT: xorl %eax, %eax +; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X86-SSE2-NEXT: je .LBB23_3 +; X86-SSE2-NEXT: .LBB23_1: # %res_block +; X86-SSE2-NEXT: xorl %eax, %eax +; X86-SSE2-NEXT: incl %eax +; X86-SSE2-NEXT: .LBB23_3: # %endblock +; X86-SSE2-NEXT: testl %eax, %eax +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp +; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: jne .LBB23_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx +; X64-SSE2-NEXT: xorl %eax, %eax +; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X64-SSE2-NEXT: je .LBB23_3 +; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax ; 
X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: popq %rcx ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length32_eq: @@ -798,27 +896,60 @@ } define i1 @length32_eq_const(i8* %X) nounwind optsize { -; X86-LABEL: length32_eq_const: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl +; X86-NOSSE-LABEL: length32_eq_const: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq_const: +; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx +; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X86-SSE2-NEXT: jne .LBB24_1 +; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx +; X86-SSE2-NEXT: xorl %eax, %eax +; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X86-SSE2-NEXT: je .LBB24_3 +; X86-SSE2-NEXT: .LBB24_1: # %res_block +; X86-SSE2-NEXT: xorl %eax, %eax +; X86-SSE2-NEXT: incl %eax +; X86-SSE2-NEXT: .LBB24_3: # %endblock +; X86-SSE2-NEXT: testl %eax, %eax +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $.L.str, %esi -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp +; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; 
X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: jne .LBB24_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx +; X64-SSE2-NEXT: xorl %eax, %eax +; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X64-SSE2-NEXT: je .LBB24_3 +; X64-SSE2-NEXT: .LBB24_1: # %res_block +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: .LBB24_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: popq %rcx ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length32_eq_const: @@ -867,15 +998,37 @@ ; X86-NEXT: setne %al ; X86-NEXT: retl ; -; X64-LABEL: length64_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $64, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setne %al -; X64-NEXT: popq %rcx -; X64-NEXT: retq +; X64-SSE2-LABEL: length64_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $64, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length64_eq: +; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: jne .LBB26_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: cmpl $-1, %ecx +; X64-AVX2-NEXT: je .LBB26_3 +; X64-AVX2-NEXT: .LBB26_1: # %res_block +; X64-AVX2-NEXT: movl $1, %eax +; X64-AVX2-NEXT: .LBB26_3: # %endblock +; X64-AVX2-NEXT: testl %eax, %eax +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind %cmp = icmp ne i32 %call, 
0 ret i1 %cmp @@ -894,16 +1047,38 @@ ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-LABEL: length64_eq_const: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $.L.str, %esi -; X64-NEXT: movl $64, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax -; X64-NEXT: sete %al -; X64-NEXT: popq %rcx -; X64-NEXT: retq +; X64-SSE2-LABEL: length64_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: movl $64, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length64_eq_const: +; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: jne .LBB27_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: cmpl $-1, %ecx +; X64-AVX2-NEXT: je .LBB27_3 +; X64-AVX2-NEXT: .LBB27_1: # %res_block +; X64-AVX2-NEXT: movl $1, %eax +; X64-AVX2-NEXT: .LBB27_3: # %endblock +; X64-AVX2-NEXT: testl %eax, %eax +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind %c = icmp eq i32 %m, 0 ret i1 %c Index: test/CodeGen/X86/memcmp.ll =================================================================== --- test/CodeGen/X86/memcmp.ll +++ test/CodeGen/X86/memcmp.ll @@ -639,22 +639,24 @@ ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl ; -; X64-LABEL: length16_eq: -; X64: # BB#0: # %loadbb -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB19_1 -; X64-NEXT: # BB#2: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; 
X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq 8(%rsi), %rcx -; X64-NEXT: je .LBB19_3 -; X64-NEXT: .LBB19_1: # %res_block -; X64-NEXT: movl $1, %eax -; X64-NEXT: .LBB19_3: # %endblock -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setne %al -; X64-NEXT: retq +; X64-SSE2-LABEL: length16_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length16_eq: +; X64-AVX: # BB#0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX-NEXT: setne %al +; X64-AVX-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp @@ -695,22 +697,23 @@ ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: retl ; -; X64-LABEL: length16_eq_const: -; X64: # BB#0: # %loadbb -; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 -; X64-NEXT: cmpq %rax, (%rdi) -; X64-NEXT: jne .LBB20_1 -; X64-NEXT: # BB#2: # %loadbb1 -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 -; X64-NEXT: cmpq %rcx, 8(%rdi) -; X64-NEXT: je .LBB20_3 -; X64-NEXT: .LBB20_1: # %res_block -; X64-NEXT: movl $1, %eax -; X64-NEXT: .LBB20_3: # %endblock -; X64-NEXT: testl %eax, %eax -; X64-NEXT: sete %al -; X64-NEXT: retq +; X64-SSE2-LABEL: length16_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length16_eq_const: +; X64-AVX: # BB#0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, 
%xmm0 +; X64-AVX-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX-NEXT: sete %al +; X64-AVX-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind %c = icmp eq i32 %m, 0 ret i1 %c @@ -750,15 +753,44 @@ ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-LABEL: length24_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $24, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax -; X64-NEXT: sete %al -; X64-NEXT: popq %rcx -; X64-NEXT: retq +; X64-SSE2-LABEL: length24_eq: +; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: jne .LBB22_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: movq 16(%rdi), %rcx +; X64-SSE2-NEXT: xorl %eax, %eax +; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx +; X64-SSE2-NEXT: je .LBB22_3 +; X64-SSE2-NEXT: .LBB22_1: # %res_block +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: .LBB22_3: # %endblock +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length24_eq: +; X64-AVX: # BB#0: # %loadbb +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX-NEXT: jne .LBB22_1 +; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: movq 16(%rdi), %rcx +; X64-AVX-NEXT: xorl %eax, %eax +; X64-AVX-NEXT: cmpq 16(%rsi), %rcx +; X64-AVX-NEXT: je .LBB22_3 +; X64-AVX-NEXT: .LBB22_1: # %res_block +; X64-AVX-NEXT: movl $1, %eax +; X64-AVX-NEXT: .LBB22_3: # %endblock +; X64-AVX-NEXT: testl %eax, %eax +; X64-AVX-NEXT: sete %al +; X64-AVX-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind %cmp = icmp eq i32 %call, 0 ret i1 %cmp @@ -777,16 
+809,43 @@ ; X86-NEXT: setne %al ; X86-NEXT: retl ; -; X64-LABEL: length24_eq_const: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $.L.str, %esi -; X64-NEXT: movl $24, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setne %al -; X64-NEXT: popq %rcx -; X64-NEXT: retq +; X64-SSE2-LABEL: length24_eq_const: +; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: jne .LBB23_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: xorl %eax, %eax +; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 +; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) +; X64-SSE2-NEXT: je .LBB23_3 +; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: .LBB23_3: # %endblock +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length24_eq_const: +; X64-AVX: # BB#0: # %loadbb +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX-NEXT: jne .LBB23_1 +; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: xorl %eax, %eax +; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 +; X64-AVX-NEXT: cmpq %rcx, 16(%rdi) +; X64-AVX-NEXT: je .LBB23_3 +; X64-AVX-NEXT: .LBB23_1: # %res_block +; X64-AVX-NEXT: movl $1, %eax +; X64-AVX-NEXT: .LBB23_3: # %endblock +; X64-AVX-NEXT: testl %eax, %eax +; X64-AVX-NEXT: setne %al +; X64-AVX-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind %c = icmp ne i32 %m, 0 ret i1 %c @@ -814,41 +873,96 @@ ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 define i1 @length32_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length32_eq: 
-; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl +; X86-NOSSE-LABEL: length32_eq: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length32_eq: +; X86-SSE1: # BB#0: +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: pushl $32 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $16, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq: +; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %edx +; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF +; X86-SSE2-NEXT: jne .LBB25_1 +; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx +; X86-SSE2-NEXT: xorl %eax, %eax +; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X86-SSE2-NEXT: je .LBB25_3 +; X86-SSE2-NEXT: .LBB25_1: # %res_block +; X86-SSE2-NEXT: movl $1, %eax +; X86-SSE2-NEXT: .LBB25_3: # %endblock +; X86-SSE2-NEXT: testl %eax, %eax +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq 
memcmp +; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: jne .LBB25_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx +; X64-SSE2-NEXT: xorl %eax, %eax +; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X64-SSE2-NEXT: je .LBB25_3 +; X64-SSE2-NEXT: .LBB25_1: # %res_block +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: .LBB25_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: popq %rcx ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq: -; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: movq 16(%rdi), %rax -; X64-AVX1-NEXT: movq (%rdi), %rcx -; X64-AVX1-NEXT: movq 8(%rdi), %rdx -; X64-AVX1-NEXT: movq 24(%rdi), %rdi -; X64-AVX1-NEXT: xorq 24(%rsi), %rdi -; X64-AVX1-NEXT: xorq 8(%rsi), %rdx -; X64-AVX1-NEXT: orq %rdi, %rdx -; X64-AVX1-NEXT: xorq 16(%rsi), %rax -; X64-AVX1-NEXT: xorq (%rsi), %rcx -; X64-AVX1-NEXT: orq %rax, %rcx -; X64-AVX1-NEXT: orq %rdx, %rcx +; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX1-NEXT: jne .LBB25_1 +; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 +; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X64-AVX1-NEXT: je .LBB25_3 +; X64-AVX1-NEXT: .LBB25_1: # %res_block +; X64-AVX1-NEXT: movl $1, %eax +; X64-AVX1-NEXT: .LBB25_3: # %endblock +; X64-AVX1-NEXT: testl %eax, %eax ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: retq ; @@ -867,42 
+981,91 @@ } define i1 @length32_eq_const(i8* %X) nounwind { -; X86-LABEL: length32_eq_const: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl +; X86-NOSSE-LABEL: length32_eq_const: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length32_eq_const: +; X86-SSE1: # BB#0: +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: pushl $32 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $16, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq_const: +; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx +; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X86-SSE2-NEXT: jne .LBB26_1 +; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx +; X86-SSE2-NEXT: xorl %eax, %eax +; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X86-SSE2-NEXT: je .LBB26_3 +; X86-SSE2-NEXT: .LBB26_1: # %res_block +; X86-SSE2-NEXT: movl $1, %eax +; X86-SSE2-NEXT: .LBB26_3: # %endblock +; X86-SSE2-NEXT: testl %eax, %eax +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $.L.str, %esi -; X64-SSE2-NEXT: movl $32, %edx -; 
X64-SSE2-NEXT: callq memcmp +; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: jne .LBB26_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx +; X64-SSE2-NEXT: xorl %eax, %eax +; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X64-SSE2-NEXT: je .LBB26_3 +; X64-SSE2-NEXT: .LBB26_1: # %res_block +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: .LBB26_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: popq %rcx ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq_const: -; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: movabsq $3544395820347831604, %rax # imm = 0x3130393837363534 -; X64-AVX1-NEXT: xorq 24(%rdi), %rax -; X64-AVX1-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 -; X64-AVX1-NEXT: xorq 8(%rdi), %rcx -; X64-AVX1-NEXT: orq %rax, %rcx -; X64-AVX1-NEXT: movabsq $3689065127958034230, %rax # imm = 0x3332313039383736 -; X64-AVX1-NEXT: xorq 16(%rdi), %rax -; X64-AVX1-NEXT: movabsq $3978425819141910832, %rdx # imm = 0x3736353433323130 -; X64-AVX1-NEXT: xorq (%rdi), %rdx -; X64-AVX1-NEXT: orq %rax, %rdx -; X64-AVX1-NEXT: orq %rcx, %rdx +; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX1-NEXT: jne .LBB26_1 +; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 +; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; X64-AVX1-NEXT: je .LBB26_3 +; X64-AVX1-NEXT: .LBB26_1: # %res_block +; X64-AVX1-NEXT: movl $1, 
%eax +; X64-AVX1-NEXT: .LBB26_3: # %endblock +; X64-AVX1-NEXT: testl %eax, %eax ; X64-AVX1-NEXT: setne %al ; X64-AVX1-NEXT: retq ; @@ -952,15 +1115,47 @@ ; X86-NEXT: setne %al ; X86-NEXT: retl ; -; X64-LABEL: length64_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $64, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setne %al -; X64-NEXT: popq %rcx -; X64-NEXT: retq +; X64-SSE2-LABEL: length64_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $64, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length64_eq: +; X64-AVX1: # BB#0: +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movl $64, %edx +; X64-AVX1-NEXT: callq memcmp +; X64-AVX1-NEXT: testl %eax, %eax +; X64-AVX1-NEXT: setne %al +; X64-AVX1-NEXT: popq %rcx +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length64_eq: +; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: jne .LBB28_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: cmpl $-1, %ecx +; X64-AVX2-NEXT: je .LBB28_3 +; X64-AVX2-NEXT: .LBB28_1: # %res_block +; X64-AVX2-NEXT: movl $1, %eax +; X64-AVX2-NEXT: .LBB28_3: # %endblock +; X64-AVX2-NEXT: testl %eax, %eax +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp @@ -979,16 +1174,49 @@ ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-LABEL: length64_eq_const: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $.L.str, %esi -; X64-NEXT: movl $64, %edx -; X64-NEXT: 
callq memcmp -; X64-NEXT: testl %eax, %eax -; X64-NEXT: sete %al -; X64-NEXT: popq %rcx -; X64-NEXT: retq +; X64-SSE2-LABEL: length64_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: movl $64, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length64_eq_const: +; X64-AVX1: # BB#0: +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movl $.L.str, %esi +; X64-AVX1-NEXT: movl $64, %edx +; X64-AVX1-NEXT: callq memcmp +; X64-AVX1-NEXT: testl %eax, %eax +; X64-AVX1-NEXT: sete %al +; X64-AVX1-NEXT: popq %rcx +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length64_eq_const: +; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: jne .LBB29_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: cmpl $-1, %ecx +; X64-AVX2-NEXT: je .LBB29_3 +; X64-AVX2-NEXT: .LBB29_1: # %res_block +; X64-AVX2-NEXT: movl $1, %eax +; X64-AVX2-NEXT: .LBB29_3: # %endblock +; X64-AVX2-NEXT: testl %eax, %eax +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind %c = icmp eq i32 %m, 0 ret i1 %c Index: test/Transforms/CodeGenPrepare/X86/memcmp.ll =================================================================== --- test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ test/Transforms/CodeGenPrepare/X86/memcmp.ll @@ -753,27 +753,13 @@ ; X32-NEXT: ret i32 [[CONV]] ; ; X64-LABEL: @cmp_eq16( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* 
[[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] -; X64: res_block: -; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i64* -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i64* -; X64-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 1 -; X64-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[TMP6]], i64 1 -; X64-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP7]] -; X64-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP9]], [[TMP10]] -; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] -; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i128* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i128* +; X64-NEXT: [[TMP3:%.*]] = load i128, i128* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i128, i128* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]] +; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 ; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; X64-NEXT: ret i32 [[CONV]] ;