Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -581,13 +581,17 @@ struct MemCmpExpansionOptions { // The list of available load sizes (in bytes), sorted in decreasing order. SmallVector<unsigned, 8> LoadSizes; + // Whether to allow overlapping loads, e.g. 7-byte compares can be done with + // two 4-byte compares instead of 4+2+1-byte compares. This requires all + // loads in LoadSizes to be doable in an unaligned way. + bool AllowOverlappingLoads = false; }; const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsZeroCmp) const; /// Enable matching of interleaved access groups. bool enableInterleavedAccessVectorization() const; - /// Enable matching of interleaved access groups that contain predicated + /// Enable matching of interleaved access groups that contain predicated /// accesses or gaps and therefore vectorized using masked /// vector loads/stores. bool enableMaskedInterleavedAccessVectorization() const; @@ -772,7 +776,7 @@ /// \return The cost of a shuffle instruction of kind Kind and of type Tp. /// The index and subtype parameters are used by the subvector insertion and /// extraction shuffle kinds to show the insert/extract point and the type of - /// the subvector being inserted/extracted. + /// the subvector being inserted/extracted. /// NOTE: For subvector extractions Tp represents the source type. int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index = 0, Type *SubTp = nullptr) const; Index: lib/CodeGen/ExpandMemCmp.cpp =================================================================== --- lib/CodeGen/ExpandMemCmp.cpp +++ lib/CodeGen/ExpandMemCmp.cpp @@ -66,23 +66,18 @@ // Represents the decomposition in blocks of the expansion. For example, // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}]. - // TODO(courbet): Involve the target more in this computation. On X86, 7 - // bytes can be done more efficiently with two overlaping 4-byte loads than - // covering the interval with [{4, 0},{2, 4},{1, 6}}. struct LoadEntry { LoadEntry(unsigned LoadSize, uint64_t Offset) : LoadSize(LoadSize), Offset(Offset) { - assert(Offset % LoadSize == 0 && "invalid load entry"); } - uint64_t getGEPIndex() const { return Offset / LoadSize; } - // The size of the load for this block, in bytes. - const unsigned LoadSize; + unsigned LoadSize; // The offset of this load WRT the base pointer, in bytes.
- const uint64_t Offset; + uint64_t Offset; }; - SmallVector<LoadEntry, 8> LoadSequence; + using LoadEntryVector = SmallVector<LoadEntry, 8>; + LoadEntryVector LoadSequence; void createLoadCmpBlocks(); void createResultBlock(); @@ -92,13 +87,23 @@ void emitLoadCompareBlock(unsigned BlockIndex); void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, unsigned &LoadIndex); - void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex); + void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned OffsetBytes); void emitMemCmpResultBlock(); Value *getMemCmpExpansionZeroCase(); Value *getMemCmpEqZeroOneBlock(); Value *getMemCmpOneBlock(); + Value *getPtrToElementAtOffset(Value *Source, Type *LoadSizeType, + uint64_t OffsetBytes); + + static LoadEntryVector + computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes, + unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte); + static LoadEntryVector + computeOverlappingLoadSequence(uint64_t Size, unsigned MaxLoadSize, + unsigned MaxNumLoads, + unsigned &NumLoadsNonOneByte); - public: +public: MemCmpExpansion(CallInst *CI, uint64_t Size, const TargetTransformInfo::MemCmpExpansionOptions &Options, unsigned MaxNumLoads, const bool IsUsedForZeroCmp, @@ -110,6 +115,65 @@ Value *getMemCmpExpansion(); }; +MemCmpExpansion::LoadEntryVector MemCmpExpansion::computeGreedyLoadSequence( + uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes, + const unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte) { + NumLoadsNonOneByte = 0; + LoadEntryVector LoadSequence; + uint64_t Offset = 0; + while (Size && !LoadSizes.empty()) { + const unsigned LoadSize = LoadSizes.front(); + const uint64_t NumLoadsForThisSize = Size / LoadSize; + if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { + // Do not expand if the total number of loads is larger than what the + // target allows. Note that it's important that we exit before completing + // the expansion to avoid using a ton of memory to store the expansion for + // large sizes. + return {}; + } + if (NumLoadsForThisSize > 0) { + for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) { + LoadSequence.push_back({LoadSize, Offset}); + Offset += LoadSize; + } + if (LoadSize > 1) { + ++NumLoadsNonOneByte; + } + Size = Size % LoadSize; + } + LoadSizes = LoadSizes.drop_front(); + } + return LoadSequence; +} + +MemCmpExpansion::LoadEntryVector +MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size, + const unsigned MaxLoadSize, + const unsigned MaxNumLoads, + unsigned &NumLoadsNonOneByte) { + + const uint64_t NumNonOverlappingLoads = Size / MaxLoadSize; + Size = Size - NumNonOverlappingLoads * MaxLoadSize; + if (NumNonOverlappingLoads + !!(Size > 0) > MaxNumLoads) + return {}; + + // Add non-overlapping loads. + LoadEntryVector LoadSequence; + uint64_t Offset = 0; + for (uint64_t I = 0; I < NumNonOverlappingLoads; ++I) { + LoadSequence.push_back({MaxLoadSize, Offset}); + Offset += MaxLoadSize; + } + + // Add the last overlapping load. + if (Size > 0) { + assert(Size < MaxLoadSize && "broken invariant"); + LoadSequence.push_back({MaxLoadSize, Offset - (MaxLoadSize - Size)}); + } + NumLoadsNonOneByte = 1; + return LoadSequence; +} + // Initialize the basic block structure required for expansion of memcmp call // with given maximum load size and memcmp size parameter. // This structure includes: @@ -133,38 +197,31 @@ Builder(CI) { assert(Size > 0 && "zero blocks"); // Scale the max size down if the target can load more bytes than we need.
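As an illustration of the two decomposition strategies above, here is a small standalone sketch (plain C++, not part of the patch; LoadEntry is reduced to a size/offset pair, and the MaxNumLoads cut-off and NumLoadsNonOneByte bookkeeping are omitted):

#include <cstdint>
#include <cstdio>
#include <vector>

struct Entry { unsigned LoadSize; uint64_t Offset; };

// Greedy: cover [0, Size) left to right with the largest available loads,
// never overlapping. For Size=7 and LoadSizes={4,2,1}: {4,0},{2,4},{1,6}.
static std::vector<Entry> greedy(uint64_t Size, std::vector<unsigned> LoadSizes) {
  std::vector<Entry> Seq;
  uint64_t Offset = 0;
  for (unsigned LS : LoadSizes) {
    for (uint64_t I = 0; I < Size / LS; ++I) {
      Seq.push_back({LS, Offset});
      Offset += LS;
    }
    Size %= LS;
  }
  return Seq;
}

// Overlapping: use only MaxLoadSize-wide loads; if a tail remains, emit one
// final load shifted back so it ends exactly at Size. For Size=7 and
// MaxLoadSize=4: {4,0},{4,3}. Byte 3 is compared twice, which is harmless
// for an equality comparison.
static std::vector<Entry> overlapping(uint64_t Size, unsigned MaxLoadSize) {
  std::vector<Entry> Seq;
  uint64_t Offset = 0;
  for (uint64_t I = 0; I < Size / MaxLoadSize; ++I) {
    Seq.push_back({MaxLoadSize, Offset});
    Offset += MaxLoadSize;
  }
  if (const uint64_t Rem = Size % MaxLoadSize)
    Seq.push_back({MaxLoadSize, Offset - (MaxLoadSize - Rem)});
  return Seq;
}

int main() {
  for (const Entry &E : greedy(7, {4, 2, 1}))
    std::printf("greedy      {%u, %llu}\n", E.LoadSize, (unsigned long long)E.Offset);
  for (const Entry &E : overlapping(7, 4))
    std::printf("overlapping {%u, %llu}\n", E.LoadSize, (unsigned long long)E.Offset);
}

For a 7-byte compare this yields two loads instead of three, which is why the constructor below prefers the overlapping sequence whenever it is strictly shorter than the greedy one.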
- size_t LoadSizeIndex = 0; - while (LoadSizeIndex < Options.LoadSizes.size() && - Options.LoadSizes[LoadSizeIndex] > Size) { - ++LoadSizeIndex; + llvm::ArrayRef<unsigned> LoadSizes(Options.LoadSizes); + while (!LoadSizes.empty() && LoadSizes.front() > Size) { + LoadSizes = LoadSizes.drop_front(); } - this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex]; + assert(!LoadSizes.empty() && "cannot load Size bytes"); + MaxLoadSize = LoadSizes.front(); // Compute the decomposition. - uint64_t CurSize = Size; - uint64_t Offset = 0; - while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) { - const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex]; - assert(LoadSize > 0 && "zero load size"); - const uint64_t NumLoadsForThisSize = CurSize / LoadSize; - if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { - // Do not expand if the total number of loads is larger than what the - // target allows. Note that it's important that we exit before completing - // the expansion to avoid using a ton of memory to store the expansion for - // large sizes. - LoadSequence.clear(); - return; - } - if (NumLoadsForThisSize > 0) { - for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) { - LoadSequence.push_back({LoadSize, Offset}); - Offset += LoadSize; - } - if (LoadSize > 1) { - ++NumLoadsNonOneByte; - } - CurSize = CurSize % LoadSize; + unsigned GreedyNumLoadsNonOneByte = 0; + LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, MaxNumLoads, + GreedyNumLoadsNonOneByte); + NumLoadsNonOneByte = GreedyNumLoadsNonOneByte; + assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); + // If we allow overlapping loads and the load sequence is not already optimal, + // use overlapping loads. + if (Options.AllowOverlappingLoads && + (LoadSequence.empty() || LoadSequence.size() > 2)) { + unsigned OverlappingNumLoadsNonOneByte = 0; + auto OverlappingLoads = computeOverlappingLoadSequence( + Size, MaxLoadSize, MaxNumLoads, OverlappingNumLoadsNonOneByte); + if (!OverlappingLoads.empty() && + (LoadSequence.empty() || + OverlappingLoads.size() < LoadSequence.size())) { + LoadSequence = OverlappingLoads; + NumLoadsNonOneByte = OverlappingNumLoadsNonOneByte; } - ++LoadSizeIndex; } assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); } @@ -189,30 +246,32 @@ EndBlock->getParent(), EndBlock); } +/// Returns a pointer to an element of type `LoadSizeType` at offset +/// `OffsetBytes`. +Value *MemCmpExpansion::getPtrToElementAtOffset(Value *Source, + Type *LoadSizeType, + uint64_t OffsetBytes) { + if (OffsetBytes > 0) { + auto *ByteType = Type::getInt8Ty(CI->getContext()); + Source = Builder.CreateGEP( + ByteType, Builder.CreateBitCast(Source, ByteType->getPointerTo()), + ConstantInt::get(ByteType, OffsetBytes)); + } + return Builder.CreateBitCast(Source, LoadSizeType->getPointerTo()); +} + // This function creates the IR instructions for loading and comparing 1 byte. // It loads 1 byte from each source of the memcmp parameters with the given // GEPIndex. It then subtracts the two loaded values and adds this result to the // final phi node for selecting the memcmp result. void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex, - unsigned GEPIndex) { - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - + unsigned OffsetBytes) { Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); - // Cast source to LoadSizeType*.
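In C terms, the new addressing scheme in getPtrToElementAtOffset above is plain byte arithmetic: advance the i8* by OffsetBytes first, then reinterpret the result as a pointer to the load type. The old code reinterpreted first and indexed in units of the load size, so it could only reach offsets that are multiples of LoadSize (hence the old Offset % LoadSize assertion). A minimal sketch, not part of the patch, with memcpy standing in for the unaligned load the target must support:

#include <cstdint>
#include <cstring>

// Equivalent of: gep i8 on the byte pointer, bitcast to i32*, then load.
// Works for any byte offset, e.g. the offset-3 loads in the tests below.
static uint32_t load4At(const void *Base, uint64_t OffsetBytes) {
  uint32_t V;
  std::memcpy(&V, static_cast<const unsigned char *>(Base) + OffsetBytes, sizeof(V));
  return V;
}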
- if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using the GEPIndex. - if (GEPIndex != 0) { - Source1 = Builder.CreateGEP(LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, GEPIndex)); - Source2 = Builder.CreateGEP(LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, GEPIndex)); - } + Value *Source1 = + getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType, OffsetBytes); + Value *Source2 = + getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType, OffsetBytes); Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); @@ -270,24 +329,10 @@ IntegerType *LoadSizeType = IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } + Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType, + CurLoadEntry.Offset); + Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType, + CurLoadEntry.Offset); // Get a constant or load a value for each source address. Value *LoadSrc1 = nullptr; @@ -378,8 +423,7 @@ const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex]; if (CurLoadEntry.LoadSize == 1) { - MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, - CurLoadEntry.getGEPIndex()); + MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, CurLoadEntry.Offset); return; } @@ -388,25 +432,12 @@ Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type"); - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } + Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType, + CurLoadEntry.Offset); + Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType, + CurLoadEntry.Offset); // Load LoadSizeType from the base address. Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); @@ -694,7 +725,6 @@ if (SizeVal == 0) { return false; } - // TTI call to check if target would like to expand memcmp. Also, get the // available load sizes. 
const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -1886,7 +1886,7 @@ { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ }; static const CostTblEntry X64CostTbl[] = { // 64-bit targets - { ISD::BITREVERSE, MVT::i64, 14 } + { ISD::BITREVERSE, MVT::i64, 14 } }; static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets { ISD::BITREVERSE, MVT::i32, 14 }, @@ -2899,6 +2899,8 @@ Options.LoadSizes.push_back(4); Options.LoadSizes.push_back(2); Options.LoadSizes.push_back(1); + // All GPR loads can be unaligned, and so can vector loads starting from SSE2. + Options.AllowOverlappingLoads = true; return Options; }(); return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions; Index: test/CodeGen/X86/memcmp-optsize.ll =================================================================== --- test/CodeGen/X86/memcmp-optsize.ll +++ test/CodeGen/X86/memcmp-optsize.ll @@ -639,17 +639,33 @@ } define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { -; X86-LABEL: length24_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl +; X86-NOSSE-LABEL: length24_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: ; X64-SSE2: # %bb.0: @@ -683,17 +699,30 @@ } define i1 @length24_eq_const(i8* %X) nounwind optsize { -; X86-LABEL: length24_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl +; X86-NOSSE-LABEL: length24_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax #
imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: Index: test/CodeGen/X86/memcmp.ll =================================================================== --- test/CodeGen/X86/memcmp.ll +++ test/CodeGen/X86/memcmp.ll @@ -362,24 +362,24 @@ define i1 @length7_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length7_eq: ; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $7 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 3(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 3(%eax), %ecx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length7_eq: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $7, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl 3(%rdi), %ecx +; X64-NEXT: xorl (%rsi), %eax +; X64-NEXT: xorl 3(%rsi), %ecx +; X64-NEXT: orl %eax, %ecx ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 7) nounwind %c = icmp ne i32 %m, 0 @@ -548,12 +548,12 @@ ; ; X64-LABEL: length11_eq: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $11, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq 3(%rdi), %rcx +; X64-NEXT: xorq (%rsi), %rax +; X64-NEXT: xorq 3(%rsi), %rcx +; X64-NEXT: orq %rax, %rcx ; X64-NEXT: sete %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 11) nounwind %c = icmp eq i32 %m, 0 @@ -640,12 +640,12 @@ ; ; X64-LABEL: length13_eq: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $13, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq 5(%rdi), %rcx +; X64-NEXT: xorq (%rsi), %rax +; X64-NEXT: xorq 5(%rsi), %rcx +; X64-NEXT: orq %rax, %rcx ; X64-NEXT: sete %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 13) nounwind %c = icmp eq i32 %m, 0 @@ -667,12 +667,12 @@ ; ; X64-LABEL: length14_eq: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $14, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq 6(%rdi), %rcx +; X64-NEXT: xorq (%rsi), %rax +; X64-NEXT: xorq 6(%rsi), %rcx +; X64-NEXT: orq %rax, %rcx ; X64-NEXT: sete %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 14) nounwind %c = icmp eq i32 %m, 0 @@ -694,12 +694,12 @@ ; ; X64-LABEL: length15_eq: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $15, %edx -; X64-NEXT: callq memcmp -; X64-NEXT: testl %eax, %eax +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq 7(%rdi), %rcx +; X64-NEXT: xorq (%rsi), %rax +; X64-NEXT: xorq 7(%rsi), %rcx +; X64-NEXT: orq %rax, %rcx ; X64-NEXT: sete %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 15) nounwind %c = icmp eq i32 %m, 0 @@ -885,17 +885,45 @@ } define i1 @length24_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length24_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; 
X86-NEXT: retl +; X86-NOSSE-LABEL: length24_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length24_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: pushl $24 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $16, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: ; X64-SSE2: # %bb.0: @@ -929,17 +957,42 @@ } define i1 @length24_eq_const(i8* %X) nounwind { -; X86-LABEL: length24_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl +; X86-NOSSE-LABEL: length24_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length24_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: pushl $24 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $16, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: Index: test/Transforms/ExpandMemCmp/X86/memcmp.ll =================================================================== --- test/Transforms/ExpandMemCmp/X86/memcmp.ll +++ test/Transforms/ExpandMemCmp/X86/memcmp.ll @@ -130,11 +130,11 @@ ; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] ; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* -; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 -; ALL-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 -; ALL-NEXT: [[TMP14:%.*]] = load i16, 
i16* [[TMP12]] +; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i16* +; ALL-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i16* +; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP11]] ; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] ; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) ; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) @@ -178,11 +178,11 @@ ; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] ; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X32: loadbb1: -; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* -; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 -; X32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1 -; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i32* +; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i32* +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP11]] ; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] ; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) ; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) @@ -272,11 +272,11 @@ ; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] ; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* -; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 -; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4 -; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i16* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i16* +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP11]] ; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] ; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) ; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) @@ -324,11 +324,11 @@ ; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] ; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* -; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 -; X64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2 -; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i32* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i32* +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP11]] ; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] ; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) ; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) @@ -394,11 +394,11 @@ ; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] ; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: 
[[TMP10:%.*]] = bitcast i8* [[X]] to i64* -; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i64* -; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 -; X64-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1 -; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i64* +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i64* +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP11]] ; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] ; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) ; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) @@ -597,11 +597,11 @@ ; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] ; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] ; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* -; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* -; X32-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 -; X32-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 -; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16* +; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* +; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]] ; X32-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] ; X32-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32 ; X32-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32 @@ -625,11 +625,11 @@ ; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64_1LD: loadbb1: -; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* -; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* -; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 -; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 -; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* +; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]] ; X64_1LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] @@ -645,11 +645,11 @@ ; X64_2LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] ; X64_2LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* -; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* -; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 -; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 -; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* +; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]] ; X64_2LD-NEXT: [[TMP11:%.*]] = 
load i16, i16* [[TMP9]] ; X64_2LD-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32 ; X64_2LD-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32 @@ -668,11 +668,71 @@ } define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq7( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3 +; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3 +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] +; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] +; X32-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64_1LD-LABEL: @cmp_eq7( +; X64_1LD-NEXT: br label [[LOADBB:%.*]] +; X64_1LD: res_block: +; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] +; X64_1LD: loadbb: +; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64_1LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64_1LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD: loadbb1: +; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3 +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3 +; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]] +; X64_1LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] +; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD: endblock: +; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_1LD-NEXT: ret i32 [[CONV]] +; +; X64_2LD-LABEL: @cmp_eq7( +; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64_2LD-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64_2LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X64_2LD-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] +; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3 +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3 +; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]] +; X64_2LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X64_2LD-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], 
[[TMP11]] +; X64_2LD-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] +; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_2LD-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) %cmp = icmp eq i32 %call, 0 @@ -687,11 +747,11 @@ ; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] ; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] ; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* -; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 -; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1 -; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]] ; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] ; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] ; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] @@ -794,11 +854,11 @@ ; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64_1LD: loadbb1: -; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* -; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* -; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 -; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 -; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* +; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]] ; X64_1LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] @@ -814,11 +874,11 @@ ; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] ; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] -; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* -; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* -; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 -; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 -; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16* +; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]] ; X64_2LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] ; X64_2LD-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i64 ; X64_2LD-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i64 @@ -837,11 +897,57 @@ } define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) 
-; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq11( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64_1LD-LABEL: @cmp_eq11( +; X64_1LD-NEXT: br label [[LOADBB:%.*]] +; X64_1LD: res_block: +; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] +; X64_1LD: loadbb: +; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD: loadbb1: +; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3 +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3 +; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* +; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] +; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] +; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]] +; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD: endblock: +; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_1LD-NEXT: ret i32 [[CONV]] +; +; X64_2LD-LABEL: @cmp_eq11( +; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] +; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 3 +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 3 +; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* +; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] +; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] +; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]] +; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]] +; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_2LD-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) %cmp = icmp eq i32 %call, 0 @@ -868,11 +974,11 @@ ; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64_1LD: loadbb1: -; X64_1LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* -; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* -; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 -; X64_1LD-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 -; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; 
X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]] ; X64_1LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] @@ -888,11 +994,11 @@ ; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] ; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] -; X64_2LD-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* -; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* -; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 -; X64_2LD-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 -; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]] ; X64_2LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] ; X64_2LD-NEXT: [[TMP12:%.*]] = zext i32 [[TMP10]] to i64 ; X64_2LD-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 @@ -911,11 +1017,57 @@ } define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq13( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64_1LD-LABEL: @cmp_eq13( +; X64_1LD-NEXT: br label [[LOADBB:%.*]] +; X64_1LD: res_block: +; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] +; X64_1LD: loadbb: +; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD: loadbb1: +; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 5 +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 5 +; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* +; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] +; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] +; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]] +; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD: endblock: +; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_1LD-NEXT: ret i32 [[CONV]] +; +; X64_2LD-LABEL: @cmp_eq13( +; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] +; X64_2LD-NEXT: 
[[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 5 +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 5 +; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* +; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] +; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] +; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]] +; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]] +; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_2LD-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) %cmp = icmp eq i32 %call, 0 @@ -924,11 +1076,57 @@ } define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq14( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64_1LD-LABEL: @cmp_eq14( +; X64_1LD-NEXT: br label [[LOADBB:%.*]] +; X64_1LD: res_block: +; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] +; X64_1LD: loadbb: +; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD: loadbb1: +; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* +; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] +; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] +; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]] +; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD: endblock: +; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_1LD-NEXT: ret i32 [[CONV]] +; +; X64_2LD-LABEL: @cmp_eq14( +; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] +; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* +; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] +; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] +; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]] +; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]] +; 
X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_2LD-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) %cmp = icmp eq i32 %call, 0 @@ -937,11 +1135,57 @@ } define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64_1LD-LABEL: @cmp_eq15( +; X64_1LD-NEXT: br label [[LOADBB:%.*]] +; X64_1LD: res_block: +; X64_1LD-NEXT: br label [[ENDBLOCK:%.*]] +; X64_1LD: loadbb: +; X64_1LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_1LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_1LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_1LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD: loadbb1: +; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 7 +; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* +; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 7 +; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* +; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] +; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] +; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]] +; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD: endblock: +; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64_1LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_1LD-NEXT: ret i32 [[CONV]] +; +; X64_2LD-LABEL: @cmp_eq15( +; X64_2LD-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64_2LD-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64_2LD-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]] +; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 7 +; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64* +; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i8 7 +; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64* +; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]] +; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]] +; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]] +; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]] +; X64_2LD-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; X64_2LD-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X64_2LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X64_2LD-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64_2LD-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) %cmp = icmp eq i32 %call, 0
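Putting the pieces together, the X64_2LD expansion of cmp_eq7 above corresponds to the following branchless C sketch (an illustration, not part of the patch): two overlapping 4-byte loads cover the 7 bytes, XOR flags a differing word, and OR folds both results into a single zero test.

#include <cstdint>
#include <cstring>

// Unaligned 4-byte load at a byte offset (stand-in for the emitted IR load).
static uint32_t load4(const void *P, uint64_t Off) {
  uint32_t V;
  std::memcpy(&V, static_cast<const unsigned char *>(P) + Off, sizeof(V));
  return V;
}

// memcmp(X, Y, 7) == 0 via loads at offsets 0 and 3; byte 3 is checked twice.
int cmp_eq7(const void *X, const void *Y) {
  const uint32_t D =
      (load4(X, 0) ^ load4(Y, 0)) | (load4(X, 3) ^ load4(Y, 3));
  return D == 0;
}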