diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -557,6 +557,25 @@
   return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
 }
 
+// If all users of this instruction are icmps against zero with the same
+// predicate, returns that predicate. Otherwise, returns BAD_ICMP_PREDICATE.
+static CmpInst::Predicate getSingleUsagePredicate(const Instruction *I) {
+  CmpInst::Predicate Usage = CmpInst::BAD_ICMP_PREDICATE;
+  for (const User *U : I->users()) {
+    const ICmpInst *const IC = dyn_cast<ICmpInst>(U);
+    if (!IC)
+      return CmpInst::BAD_ICMP_PREDICATE;
+    Constant *const C = dyn_cast<Constant>(IC->getOperand(1));
+    if (!(C && C->isNullValue()))
+      return CmpInst::BAD_ICMP_PREDICATE;
+    const auto Pred = IC->getPredicate();
+    if (Usage != CmpInst::BAD_ICMP_PREDICATE && Pred != Usage)
+      return CmpInst::BAD_ICMP_PREDICATE; // Inconsistent predicates.
+    Usage = Pred;
+  }
+  return Usage;
+}
+
 /// A memcmp expansion that only has one block of load and compare can bypass
 /// the compare, branch, and phi IR that is required in the general case.
 Value *MemCmpExpansion::getMemCmpOneBlock() {
@@ -589,17 +608,39 @@
     return Builder.CreateSub(LoadSrc1, LoadSrc2);
   }
 
-  // The result of memcmp is negative, zero, or positive, so produce that by
-  // subtracting 2 extended compare bits: sub (ugt, ult).
-  // If a target prefers to use selects to get -1/0/1, they should be able
-  // to transform this later. The inverse transform (going from selects to math)
-  // may not be possible in the DAG because the selects got converted into
-  // branches before we got there.
-  Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
-  Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
-  Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
-  Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
-  return Builder.CreateSub(ZextUGT, ZextULT);
+  const auto Pred = getSingleUsagePredicate(CI);
+  switch (Pred) {
+  case CmpInst::ICMP_EQ:
+  case CmpInst::ICMP_NE:
+    llvm_unreachable("EQ/NE are handled separately via IsUsedForZeroCmp");
+  case CmpInst::ICMP_SLT: {
+    // `memcmp(a, b, N) < 0`  <=>  `*a < *b`
+    //                        <=>  `(icmp ult *a, *b) == 1`
+    //                        <=>  `sext(icmp ult *a, *b) < 0`
+    Value *Cmp = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+    return Builder.CreateSExt(Cmp, Builder.getInt32Ty());
+  }
+  case CmpInst::ICMP_SGT: {
+    // `memcmp(a, b, N) > 0`  <=>  `*a > *b`
+    //                        <=>  `(icmp ugt *a, *b) == 1`
+    //                        <=>  `zext(icmp ugt *a, *b) > 0`
+    Value *Cmp = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
+    return Builder.CreateZExt(Cmp, Builder.getInt32Ty());
+  }
+  default: {
+    // The result of memcmp is negative, zero, or positive, so produce that by
+    // subtracting 2 extended compare bits: sub (ugt, ult).
+    // If a target prefers to use selects to get -1/0/1, they should be able
+    // to transform this later. The inverse transform (going from selects to
+    // math) may not be possible in the DAG because the selects got converted
+    // into branches before we got there.
+    Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
+    Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+    Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
+    Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
+    return Builder.CreateSub(ZextUGT, ZextULT);
+  }
+  }
 }
 
 // This function expands the memcmp call into an inline expansion and returns
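
The slt/sgt equivalences in the comments above are easy to check exhaustively for the single-load case. The following standalone sketch is illustrative only and not part of the patch; `be_load32` is a hypothetical helper standing in for the expansion's load+bswap. It verifies that `sext(icmp ult)` is negative exactly when memcmp is, and `zext(icmp ugt)` is positive exactly when memcmp is:

  // Standalone sanity check for the slt/sgt expansion equivalences.
  // Not part of the patch; be_load32 is an illustrative helper.
  #include <cassert>
  #include <cstdint>
  #include <cstring>

  // Big-endian 32-bit load, i.e. what the expansion's load + bswap produces.
  static uint32_t be_load32(const unsigned char *P) {
    return (uint32_t(P[0]) << 24) | (uint32_t(P[1]) << 16) |
           (uint32_t(P[2]) << 8) | uint32_t(P[3]);
  }

  int main() {
    const unsigned char Bufs[][4] = {
        {0, 0, 0, 0}, {0, 0, 0, 1}, {0x7f, 0xff, 0xff, 0xff},
        {0x80, 0, 0, 0}, {0xff, 0xff, 0xff, 0xff}};
    for (const auto &A : Bufs) {
      for (const auto &B : Bufs) {
        const uint32_t LA = be_load32(A), LB = be_load32(B);
        const int M = std::memcmp(A, B, 4);
        // ICMP_SLT case: sext(icmp ult LA, LB) is negative iff memcmp < 0.
        const int32_t SExt = LA < LB ? -1 : 0;
        assert((SExt < 0) == (M < 0));
        // ICMP_SGT case: zext(icmp ugt LA, LB) is positive iff memcmp > 0.
        const int32_t ZExt = LA > LB ? 1 : 0;
        assert((ZExt > 0) == (M > 0));
      }
    }
    return 0;
  }
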
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -352,29 +352,21 @@
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: length4_lt:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
 ; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    cmpl %ecx, %eax
+; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
   %c = icmp slt i32 %m, 0
@@ -393,7 +385,6 @@
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %dl
-; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    setg %al
 ; X86-NEXT:    retl
@@ -407,7 +398,6 @@
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpl %ecx, %eax
 ; X64-NEXT:    seta %dl
-; X64-NEXT:    sbbl $0, %edx
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    setg %al
 ; X64-NEXT:    retq
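
For reference, the `length4_lt` checks above correspond to a source pattern like the following (an illustrative reconstruction, not a file from the patch; only the `memcmp` + `< 0` shape matters, the function name just mirrors the test):

  #include <cstring>

  // Clang lowers this to the `call i32 @memcmp` + `icmp slt i32 %m, 0` IR
  // checked by length4_lt; with this patch the backend emits the short
  // load + bswap + cmp + setb sequence instead of seta/sbb/shr.
  bool length4_lt(const char *X, const char *Y) {
    return std::memcmp(X, Y, 4) < 0;
  }
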