Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -547,8 +547,13 @@
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
-  /// \brief Enable inline expansion of memcmp
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize) const;
+  struct MemCmpExpansionOptions {
+    // The list of available load sizes (in bytes), sorted in decreasing order.
+    SmallVector<unsigned, 8> LoadSizes;
+  };
+  /// \brief Returns options for the inline expansion of memcmp, or nullptr if
+  /// expansion is disabled. IsThreeWay is false for memcmp(p1, p2, s) == 0.
+  const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const;
 
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
@@ -985,7 +990,7 @@
                                   unsigned VF) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
-  virtual bool enableMemCmpExpansion(unsigned &MaxLoadSize) = 0;
+  virtual const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -1235,8 +1240,8 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize) override {
-    return Impl.enableMemCmpExpansion(MaxLoadSize);
+  const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const override {
+    return Impl.enableMemCmpExpansion(IsThreeWay);
   }
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -292,7 +292,9 @@
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize) { return false; }
+  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const {
+    return nullptr;
+  }
 
   bool enableInterleavedAccessVectorization() { return false; }
 
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -245,8 +245,9 @@
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
-bool TargetTransformInfo::enableMemCmpExpansion(unsigned &MaxLoadSize) const {
-  return TTIImpl->enableMemCmpExpansion(MaxLoadSize);
+const TargetTransformInfo::MemCmpExpansionOptions *
+TargetTransformInfo::enableMemCmpExpansion(bool IsThreeWay) const {
+  return TTIImpl->enableMemCmpExpansion(IsThreeWay);
 }
 
 bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
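
A usage sketch of the new interface (hypothetical helper, not part of this patch): a null result means the target does not want memcmp expanded; otherwise LoadSizes is sorted in decreasing order, so front() is the widest available load, which recovers the old MaxLoadSize out-parameter.

#include "llvm/Analysis/TargetTransformInfo.h"
// Hypothetical, for illustration only: how a consumer of the old
// `bool enableMemCmpExpansion(unsigned &MaxLoadSize)` maps onto the new API.
static unsigned getMaxLoadSizeOrZero(const llvm::TargetTransformInfo &TTI,
                                     bool IsThreeWay) {
  const llvm::TargetTransformInfo::MemCmpExpansionOptions *Options =
      TTI.enableMemCmpExpansion(IsThreeWay);
  return Options ? Options->LoadSizes.front() : 0; // 0 means "disabled".
}
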
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -1758,7 +1758,9 @@
   Value *getMemCmpOneBlock();
 
 public:
-  MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize,
+  MemCmpExpansion(CallInst *CI, uint64_t Size,
+                  const TargetTransformInfo::MemCmpExpansionOptions &Options,
+                  bool IsUsedForZeroCmp,
                   unsigned NumLoadsPerBlock, const DataLayout &DL);
 
   unsigned getNumBlocks();
@@ -1778,26 +1780,30 @@
 // 3. ResultBlock, block to branch to for early exit when a
 //    LoadCmpBlock finds a difference.
 MemCmpExpansion::MemCmpExpansion(CallInst *const CI, uint64_t Size,
-                                 const unsigned MaxLoadSize,
+                                 const TargetTransformInfo::MemCmpExpansionOptions &Options,
+                                 const bool IsUsedForZeroCmp,
                                  const unsigned LoadsPerBlock,
                                  const DataLayout &TheDataLayout)
     : CI(CI),
       Size(Size),
-      MaxLoadSize(MaxLoadSize),
       NumLoadsNonOneByte(0),
       NumLoadsPerBlock(LoadsPerBlock),
-      IsUsedForZeroCmp(isOnlyUsedInZeroEqualityComparison(CI)),
+      IsUsedForZeroCmp(IsUsedForZeroCmp),
       DL(TheDataLayout),
       Builder(CI) {
   // Scale the max size down if the target can load more bytes than we need.
-  while (this->MaxLoadSize > Size) {
-    this->MaxLoadSize /= 2;
+  size_t LoadSizeIndex = 0;
+  while (LoadSizeIndex < Options.LoadSizes.size() &&
+         Options.LoadSizes[LoadSizeIndex] > Size) {
+    ++LoadSizeIndex;
   }
+  assert(LoadSizeIndex < Options.LoadSizes.size());
+  MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
   // Compute the decomposition.
-  unsigned LoadSize = this->MaxLoadSize;
   assert(Size > 0 && "zero blocks");
   uint64_t Offset = 0;
-  while (Size) {
+  while (Size && LoadSizeIndex < Options.LoadSizes.size()) {
+    const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
     assert(LoadSize > 0 && "zero load size");
     const uint64_t NumLoadsForThisSize = Size / LoadSize;
     if (NumLoadsForThisSize > 0) {
@@ -1810,11 +1816,7 @@
       }
       Size = Size % LoadSize;
     }
-    // FIXME: This can result in a non-native load size (e.g. X86-32+SSE can
-    // load 16 and 4 but not 8), which throws the load count off (e.g. in the
-    // aforementioned case, 16 bytes will count for 2 loads but will generate
-    // 4).
-    LoadSize /= 2;
+    ++LoadSizeIndex;
   }
 }
@@ -2346,12 +2348,13 @@
   const uint64_t SizeVal = SizeCast->getZExtValue();
 
   // TTI call to check if target would like to expand memcmp. Also, get the
-  // max LoadSize.
-  unsigned MaxLoadSize;
-  if (!TTI->enableMemCmpExpansion(MaxLoadSize)) return false;
+  // available load sizes.
+  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+  const auto *const Options = TTI->enableMemCmpExpansion(!IsUsedForZeroCmp);
+  if (!Options) return false;
 
-  MemCmpExpansion Expansion(CI, SizeVal, MaxLoadSize, MemCmpNumLoadsPerBlock,
-                            *DL);
+  MemCmpExpansion Expansion(CI, SizeVal, *Options, IsUsedForZeroCmp,
+                            MemCmpNumLoadsPerBlock, *DL);
 
   // Don't expand if this will require more loads than desired by the target.
   if (Expansion.getNumLoads() >
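
The constructor above now performs a greedy decomposition over the target-provided, sorted list instead of repeatedly halving a power of two. A standalone sketch of the computation (illustration only, assuming just the sorted-decreasing invariant documented on MemCmpExpansionOptions):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// {LoadSize, Offset} pairs, widest loads first, mirroring LoadSequence.
using LoadSeq = std::vector<std::pair<unsigned, uint64_t>>;

// Greedy decomposition of Size bytes into the available load sizes.
// LoadSizes must be sorted in decreasing order and end with 1 so that
// any remainder can be covered.
LoadSeq decompose(uint64_t Size, const std::vector<unsigned> &LoadSizes) {
  LoadSeq Sequence;
  uint64_t Offset = 0;
  for (unsigned LoadSize : LoadSizes) {
    for (uint64_t I = 0, E = Size / LoadSize; I < E; ++I) {
      Sequence.push_back({LoadSize, Offset});
      Offset += LoadSize;
    }
    Size %= LoadSize;
  }
  assert(Size == 0 && "LoadSizes must end with 1");
  return Sequence;
}

For example, decompose(24, {16, 8, 4, 2, 1}) yields {16,0},{8,16}. The removed FIXME existed because halving from 16 visits 8, which X86-32+SSE cannot load natively; iterating over an explicit per-target list makes that case impossible by construction.
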
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -63,7 +63,7 @@
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize);
+  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const;
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector) const;
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -226,9 +226,17 @@
   return LoopHasReductions;
 }
 
-bool PPCTTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) {
-  MaxLoadSize = 8;
-  return true;
+const PPCTTIImpl::TTI::MemCmpExpansionOptions *
+PPCTTIImpl::enableMemCmpExpansion(bool IsThreeWay) const {
+  static const auto Options = []() {
+    TTI::MemCmpExpansionOptions Options;
+    Options.LoadSizes.push_back(8);
+    Options.LoadSizes.push_back(4);
+    Options.LoadSizes.push_back(2);
+    Options.LoadSizes.push_back(1);
+    return Options;
+  }();
+  return &Options;
 }
 
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -127,7 +127,7 @@
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize);
+  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const;
   bool enableInterleavedAccessVectorization();
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2536,10 +2536,37 @@
   return (CallerBits & CalleeBits) == CalleeBits;
 }
 
-bool X86TTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) {
-  // TODO: We can increase these based on available vector ops.
-  MaxLoadSize = ST->is64Bit() ? 8 : 4;
-  return true;
+const X86TTIImpl::TTI::MemCmpExpansionOptions *
+X86TTIImpl::enableMemCmpExpansion(bool IsThreeWay) const {
+  // Only enable vector loads for equality comparison, as we don't have a
+  // vector bswap.
+  static const auto ThreeWayOptions = [this]() {
+    TTI::MemCmpExpansionOptions Options;
+    if (ST->is64Bit()) {
+      Options.LoadSizes.push_back(8);
+    }
+    Options.LoadSizes.push_back(4);
+    Options.LoadSizes.push_back(2);
+    Options.LoadSizes.push_back(1);
+    return Options;
+  }();
+  static const auto EqZeroOptions = [this]() {
+    TTI::MemCmpExpansionOptions Options;
+    if (ST->hasAVX512())
+      Options.LoadSizes.push_back(64);
+    if (ST->hasAVX())
+      Options.LoadSizes.push_back(32);
+    if (ST->hasSSE1())
+      Options.LoadSizes.push_back(16);
+    if (ST->is64Bit()) {
+      Options.LoadSizes.push_back(8);
+    }
+    Options.LoadSizes.push_back(4);
+    Options.LoadSizes.push_back(2);
+    Options.LoadSizes.push_back(1);
+    return Options;
+  }();
+  return IsThreeWay ? &ThreeWayOptions : &EqZeroOptions;
 }
 
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
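
To make the two X86 option sets concrete (illustration only, reusing the decompose() sketch after the CodeGenPrepare hunk above; the expected sequences match the FileCheck tests below):

#include <cstdio>
// Reuses decompose()/LoadSeq from the sketch above.
int main() {
  // EqZero options with SSE on x86-64: one vector + one scalar load for 24B.
  for (auto &P : decompose(24, {16, 8, 4, 2, 1}))   // -> {16,0} {8,16}
    std::printf("{%u,%llu} ", P.first, (unsigned long long)P.second);
  std::printf("\n");
  // Three-way options stop at 8 bytes: an ordered result needs a byte swap
  // of each loaded word, and there is no vector bswap.
  for (auto &P : decompose(16, {8, 4, 2, 1}))       // -> {8,0} {8,8}
    std::printf("{%u,%llu} ", P.first, (unsigned long long)P.second);
  std::printf("\n");
  return 0;
}
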
Index: lib/Transforms/Scalar/MergeICmps.cpp
===================================================================
--- lib/Transforms/Scalar/MergeICmps.cpp
+++ lib/Transforms/Scalar/MergeICmps.cpp
@@ -625,8 +625,7 @@
 
   // We only try merging comparisons if the target wants to expand memcmp later.
   // The rationale is to avoid turning small chains into memcmp calls.
-  unsigned MaxLoadSize;
-  if (!TTI->enableMemCmpExpansion(MaxLoadSize)) return PreservedAnalyses::all();
+  if (!TTI->enableMemCmpExpansion(/*IsThreeWay=*/false)) return PreservedAnalyses::all();
 
   bool MadeChange = false;
Index: test/CodeGen/X86/memcmp-optsize.ll
===================================================================
--- test/CodeGen/X86/memcmp-optsize.ll
+++ test/CodeGen/X86/memcmp-optsize.ll
@@ -598,22 +598,24 @@
 ; X86-SSE2-NEXT: setne %al
 ; X86-SSE2-NEXT: retl
 ;
-; X64-LABEL: length16_eq:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpq 8(%rsi), %rcx
-; X64-NEXT: je .LBB17_3
-; X64-NEXT: .LBB17_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB17_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length16_eq:
+; X64-AVX2: # BB#0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -642,22 +644,23 @@
 ; X86-SSE2-NEXT: sete %al
 ; X86-SSE2-NEXT: retl
 ;
-; X64-LABEL: length16_eq_const:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT: cmpq %rax, (%rdi)
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938
-; X64-NEXT: cmpq %rcx, 8(%rdi)
-; X64-NEXT: je .LBB18_3
-; X64-NEXT: .LBB18_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB18_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length16_eq_const:
+; X64-AVX2: # BB#0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -697,15 +700,44 @@
 ; X86-NEXT: sete %al
 ; X86-NEXT: retl
 ;
-; X64-LABEL: length24_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq:
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB20_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movq 16(%rdi), %rcx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
+; X64-SSE2-NEXT: je .LBB20_3
+; X64-SSE2-NEXT: .LBB20_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB20_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length24_eq:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: jne .LBB20_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: movq 16(%rdi), %rcx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx
+; X64-AVX2-NEXT: je .LBB20_3
+; X64-AVX2-NEXT: .LBB20_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB20_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
@@ -724,16 +756,43 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
 ; X86-NEXT: retl
 ;
-; X64-LABEL: length24_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq_const:
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB21_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
+; X64-SSE2-NEXT: je .LBB21_3
+; X64-SSE2-NEXT: .LBB21_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB21_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length24_eq_const:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: jne .LBB21_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi)
+; X64-AVX2-NEXT: je .LBB21_3
+; X64-AVX2-NEXT: .LBB21_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB21_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -761,26 +820,65 @@
 ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
 define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
-; X86-LABEL: length32_eq:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE: # BB#0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: sete %al
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE2-LABEL: length32_eq:
+; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu (%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
+; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB23_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB23_3
+; X86-SSE2-NEXT: .LBB23_1: # %res_block
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: incl %eax
+; X86-SSE2-NEXT: .LBB23_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: sete %al
+; X86-SSE2-NEXT: retl
 ;
 ; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB23_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB23_3
+; X64-SSE2-NEXT: .LBB23_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB23_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
 ; X64-SSE2-NEXT: sete %al
-; X64-SSE2-NEXT: popq %rcx
 ; X64-SSE2-NEXT: retq
 ;
 ; X64-AVX2-LABEL: length32_eq:
@@ -798,27 +896,60 @@
 }
 
 define i1 @length32_eq_const(i8* %X) nounwind optsize {
-; X86-LABEL: length32_eq_const:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl $.L.str
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE: # BB#0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl $.L.str
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: setne %al
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE2-LABEL: length32_eq_const:
+; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movdqu (%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB24_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB24_3
+; X86-SSE2-NEXT: .LBB24_1: # %res_block
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: incl %eax
+; X86-SSE2-NEXT: .LBB24_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: setne %al
+; X86-SSE2-NEXT: retl
 ;
 ; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $.L.str, %esi
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB24_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB24_3
+; X64-SSE2-NEXT: .LBB24_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB24_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
 ; X64-SSE2-NEXT: setne %al
-; X64-SSE2-NEXT: popq %rcx
 ; X64-SSE2-NEXT: retq
 ;
 ; X64-AVX2-LABEL: length32_eq_const:
@@ -867,15 +998,37 @@
 ; X86-NEXT: setne %al
 ; X86-NEXT: retl
 ;
-; X64-LABEL: length64_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length64_eq:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB26_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB26_3
+; X64-AVX2-NEXT: .LBB26_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB26_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -894,16 +1047,38 @@
 ; X86-NEXT: sete %al
 ; X86-NEXT: retl
 ;
-; X64-LABEL: length64_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq_const:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $.L.str, %esi
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length64_eq_const:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB27_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB27_3
+; X64-AVX2-NEXT: .LBB27_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB27_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
Index: test/CodeGen/X86/memcmp.ll
===================================================================
--- test/CodeGen/X86/memcmp.ll
+++ test/CodeGen/X86/memcmp.ll
@@ -23,7 +23,7 @@
 ; X86-NEXT: movzwl %cx, %eax
 ; X86-NEXT: movzwl %dx, %ecx
 ; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length2:
 ; X64: # BB#0:
@@ -34,7 +34,7 @@
 ; X64-NEXT: movzwl %ax, %eax
 ; X64-NEXT: movzwl %cx, %ecx
 ; X64-NEXT: subl %ecx, %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
   ret i32 %m
 }
@@ -47,14 +47,14 @@
 ; X86-NEXT: movzwl (%ecx), %ecx
 ; X86-NEXT: cmpw (%eax), %cx
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length2_eq:
 ; X64: # BB#0:
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: cmpw (%rsi), %ax
 ; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -67,14 +67,14 @@
 ; X86-NEXT: movzwl (%eax), %eax
 ; X86-NEXT: cmpl $12849, %eax # imm = 0x3231
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length2_eq_const:
 ; X64: # BB#0:
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: cmpl $12849, %eax # imm = 0x3231
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -91,7 +91,7 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length2_eq_nobuiltin_attr:
 ; X64: # BB#0:
@@ -101,7 +101,7 @@
 ; X64-NEXT: testl %eax, %eax
 ; X64-NEXT: sete %al
 ; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -124,13 +124,13 @@
 ; X86-NEXT: movzbl 2(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ; X86-NEXT: .LBB4_1: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length3:
 ; X64: # BB#0: # %loadbb
@@ -144,12 +144,12 @@
 ; X64-NEXT: movzbl 2(%rdi), %eax
 ; X64-NEXT: movzbl 2(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
 ; X64-NEXT: .LBB4_1: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
   ret i32 %m
 }
@@ -172,7 +172,7 @@
 ; X86-NEXT: .LBB5_3: # %endblock
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length3_eq:
 ; X64: # BB#0: # %loadbb
@@ -189,7 +189,7 @@
 ; X64-NEXT: .LBB5_3: # %endblock
 ; X64-NEXT: testl %eax, %eax
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -208,7 +208,7 @@
 ; X86-NEXT: cmpl %edx, %ecx
 ; X86-NEXT: seta %al
 ; X86-NEXT: sbbl $0, %eax
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length4:
 ; X64: # BB#0:
@@ -220,7 +220,7 @@
 ; X64-NEXT: cmpl %edx, %ecx
 ; X64-NEXT: seta %al
 ; X64-NEXT: sbbl $0, %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
   ret i32 %m
 }
@@ -233,14 +233,14 @@
 ; X86-NEXT: movl (%ecx), %ecx
 ; X86-NEXT: cmpl (%eax), %ecx
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length4_eq:
 ; X64: # BB#0:
 ; X64-NEXT: movl (%rdi), %eax
 ; X64-NEXT: cmpl (%rsi), %eax
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -252,13 +252,13 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length4_eq_const:
 ; X64: # BB#0:
 ; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231
 ; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -281,13 +281,13 @@
 ; X86-NEXT: movzbl 4(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ; X86-NEXT: .LBB9_1: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length5:
 ; X64: # BB#0: # %loadbb
@@ -301,12 +301,12 @@
 ; X64-NEXT: movzbl 4(%rdi), %eax
 ; X64-NEXT: movzbl 4(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
 ; X64-NEXT: .LBB9_1: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
   ret i32 %m
 }
@@ -329,7 +329,7 @@
 ; X86-NEXT: .LBB10_3: # %endblock
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length5_eq:
 ; X64: # BB#0: # %loadbb
@@ -346,7 +346,7 @@
 ; X64-NEXT: .LBB10_3: # %endblock
 ; X64-NEXT: testl %eax, %eax
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -374,14 +374,14 @@
 ; X86-NEXT: jne .LBB11_1
 ; X86-NEXT: # BB#3: # %endblock
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ; X86-NEXT: .LBB11_1: # %res_block
 ; X86-NEXT: xorl %eax, %eax
 ; X86-NEXT: cmpl %edx, %ecx
 ; X86-NEXT: setae %al
 ; X86-NEXT: leal -1(%eax,%eax), %eax
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length8:
 ; X64: # BB#0:
@@ -393,7 +393,7 @@
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: seta %al
 ; X64-NEXT: sbbl $0, %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
   ret i32 %m
 }
@@ -416,14 +416,14 @@
 ; X86-NEXT: .LBB12_3: # %endblock
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length8_eq:
 ; X64: # BB#0:
 ; X64-NEXT: movq (%rdi), %rax
 ; X64-NEXT: cmpq (%rsi), %rax
 ; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -444,14 +444,14 @@
 ; X86-NEXT: .LBB13_3: # %endblock
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length8_eq_const:
 ; X64: # BB#0:
 ; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
 ; X64-NEXT: cmpq %rax, (%rdi)
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -468,7 +468,7 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length12_eq:
 ; X64: # BB#0: # %loadbb
@@ -485,7 +485,7 @@
 ; X64-NEXT: .LBB14_3: # %endblock
 ; X64-NEXT: testl %eax, %eax
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -500,7 +500,7 @@
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: calll memcmp
 ; X86-NEXT: addl $16, %esp
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length12:
 ; X64: # BB#0: # %loadbb
@@ -519,13 +519,13 @@
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: jne .LBB15_1
 ; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
 ; X64-NEXT: .LBB15_1: # %res_block
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: setae %al
 ; X64-NEXT: leal -1(%rax,%rax), %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
   ret i32 %m
 }
@@ -541,7 +541,7 @@
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: calll memcmp
 ; X86-NEXT: addl $16, %esp
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length16:
 ; X64: # BB#0: # %loadbb
@@ -560,13 +560,13 @@
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: jne .LBB16_1
 ; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
 ; X64-NEXT: .LBB16_1: # %res_block
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: setae %al
 ; X64-NEXT: leal -1(%rax,%rax), %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
   ret i32 %m
 }
@@ -582,7 +582,7 @@
 ; X86-NOSSE-NEXT: addl $16, %esp
 ; X86-NOSSE-NEXT: testl %eax, %eax
 ; X86-NOSSE-NEXT: setne %al
-; X86-NOSSE-NEXT: retl
+; X86-NOSSE-NEXT: ret{{[l|q]}}
 ;
 ; X86-SSE2-LABEL: length16_eq:
 ; X86-SSE2: # BB#0:
@@ -594,24 +594,26 @@
 ; X86-SSE2-NEXT: pmovmskb %xmm1, %eax
 ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT: setne %al
-; X86-SSE2-NEXT: retl
+; X86-SSE2-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length16_eq:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpq 8(%rsi), %rcx
-; X64-NEXT: je .LBB17_3
-; X64-NEXT: .LBB17_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB17_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length16_eq:
+; X64-AVX2: # BB#0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -628,7 +630,7 @@
 ; X86-NOSSE-NEXT: addl $16, %esp
 ; X86-NOSSE-NEXT: testl %eax, %eax
 ; X86-NOSSE-NEXT: sete %al
-; X86-NOSSE-NEXT: retl
+; X86-NOSSE-NEXT: ret{{[l|q]}}
 ;
 ; X86-SSE2-LABEL: length16_eq_const:
 ; X86-SSE2: # BB#0:
@@ -638,24 +640,25 @@
 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT: sete %al
-; X86-SSE2-NEXT: retl
+; X86-SSE2-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length16_eq_const:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT: cmpq %rax, (%rdi)
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938
-; X64-NEXT: cmpq %rcx, 8(%rdi)
-; X64-NEXT: je .LBB18_3
-; X64-NEXT: .LBB18_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB18_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length16_eq_const:
+; X64-AVX2: # BB#0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -672,7 +675,7 @@
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: calll memcmp
 ; X86-NEXT: addl $16, %esp
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length24:
 ; X64: # BB#0:
@@ -693,17 +696,46 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length24_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq:
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB20_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movq 16(%rdi), %rcx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
+; X64-SSE2-NEXT: je .LBB20_3
+; X64-SSE2-NEXT: .LBB20_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB20_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length24_eq:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: jne .LBB20_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: movq 16(%rdi), %rcx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx
+; X64-AVX2-NEXT: je .LBB20_3
+; X64-AVX2-NEXT: .LBB20_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB20_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
@@ -720,18 +752,45 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length24_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq_const:
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB21_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
+; X64-SSE2-NEXT: je .LBB21_3
+; X64-SSE2-NEXT: .LBB21_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB21_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length24_eq_const:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: jne .LBB21_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi)
+; X64-AVX2-NEXT: je .LBB21_3
+; X64-AVX2-NEXT: .LBB21_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB21_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -746,7 +805,7 @@
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: calll memcmp
 ; X86-NEXT: addl $16, %esp
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length32:
 ; X64: # BB#0:
@@ -759,27 +818,65 @@
 ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
 define i1 @length32_eq(i8* %x, i8* %y) nounwind {
-; X86-LABEL: length32_eq:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE: # BB#0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: sete %al
+; X86-NOSSE-NEXT: ret{{[l|q]}}
+;
+; X86-SSE2-LABEL: length32_eq:
+; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu (%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
+; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB23_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB23_3
+; X86-SSE2-NEXT: .LBB23_1: # %res_block
+; X86-SSE2-NEXT: movl $1, %eax
+; X86-SSE2-NEXT: .LBB23_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: sete %al
+; X86-SSE2-NEXT: ret{{[l|q]}}
 ;
 ; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB23_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB23_3
+; X64-SSE2-NEXT: .LBB23_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB23_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
 ; X64-SSE2-NEXT: sete %al
-; X64-SSE2-NEXT: popq %rcx
-; X64-SSE2-NEXT: retq
+; X64-SSE2-NEXT: ret{{[l|q]}}
 ;
 ; X64-AVX2-LABEL: length32_eq:
@@ -789,35 +886,67 @@
 ; X64-AVX2-NEXT: cmpl $-1, %eax
 ; X64-AVX2-NEXT: sete %al
 ; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
 }
 
 define i1 @length32_eq_const(i8* %X) nounwind {
-; X86-LABEL: length32_eq_const:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl $.L.str
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE: # BB#0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl $.L.str
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: setne %al
+; X86-NOSSE-NEXT: ret{{[l|q]}}
+;
+; X86-SSE2-LABEL: length32_eq_const:
+; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movdqu (%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB24_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB24_3
+; X86-SSE2-NEXT: .LBB24_1: # %res_block
+; X86-SSE2-NEXT: movl $1, %eax
+; X86-SSE2-NEXT: .LBB24_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: setne %al
+; X86-SSE2-NEXT: ret{{[l|q]}}
 ;
 ; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $.L.str, %esi
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB24_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB24_3
+; X64-SSE2-NEXT: .LBB24_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB24_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
 ; X64-SSE2-NEXT: setne %al
-; X64-SSE2-NEXT: popq %rcx
-; X64-SSE2-NEXT: retq
+; X64-SSE2-NEXT: ret{{[l|q]}}
 ;
 ; X64-AVX2-LABEL: length32_eq_const:
@@ -827,7 +956,7 @@
 ; X64-AVX2-NEXT: cmpl $-1, %eax
 ; X64-AVX2-NEXT: setne %al
 ; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -842,7 +971,7 @@
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: calll memcmp
 ; X86-NEXT: addl $16, %esp
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length64:
 ; X64: # BB#0:
@@ -863,17 +992,39 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length64_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length64_eq:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB26_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB26_3
+; X64-AVX2-NEXT: .LBB26_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB26_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -890,18 +1041,40 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length64_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq_const:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $.L.str, %esi
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length64_eq_const:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB27_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB27_3
+; X64-AVX2-NEXT: .LBB27_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB27_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
Index: test/Transforms/CodeGenPrepare/X86/memcmp.ll
===================================================================
--- test/Transforms/CodeGenPrepare/X86/memcmp.ll
+++ test/Transforms/CodeGenPrepare/X86/memcmp.ll
@@ -753,27 +753,13 @@
 ; X32-NEXT: ret i32 [[CONV]]
 ;
 ; X64-LABEL: @cmp_eq16(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
-; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
-; X64: res_block:
-; X64-NEXT: br label [[ENDBLOCK:%.*]]
-; X64: loadbb1:
-; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i64*
-; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i64*
-; X64-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 1
-; X64-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[TMP6]], i64 1
-; X64-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP7]]
-; X64-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]]
-; X64-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP9]], [[TMP10]]
-; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
-; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
-; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i128*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i128*
+; X64-NEXT: [[TMP3:%.*]] = load i128, i128* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i128, i128* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]]
+; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
 ; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT: ret i32 [[CONV]]
 ;