Index: include/llvm/Transforms/Utils/BuildLibCalls.h =================================================================== --- include/llvm/Transforms/Utils/BuildLibCalls.h +++ include/llvm/Transforms/Utils/BuildLibCalls.h @@ -93,6 +93,10 @@ Value *emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, const DataLayout &DL, const TargetLibraryInfo *TLI); + /// Emit a call to the bcmp function. + Value *emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, + const DataLayout &DL, const TargetLibraryInfo *TLI); + /// Emit a call to the unary function named 'Name' (e.g. 'floor'). This /// function is known to take a single of type matching 'Op' and returns one /// value with the same type. If 'Op' is a long double, 'l' is added as the Index: lib/Analysis/TargetLibraryInfo.cpp =================================================================== --- lib/Analysis/TargetLibraryInfo.cpp +++ lib/Analysis/TargetLibraryInfo.cpp @@ -50,6 +50,16 @@ return true; } +static bool hasBcmp(const Triple &TT) { + // Posix removed support from bcmp() in 2001, but the glibc and several + // implementations of the libc still have it. + if (TT.isOSLinux()) + return TT.isGNUEnvironment() || TT.isMusl(); + // Both NetBSD and OpenBSD are planning to remove the function. Windows does + // not have it. + return TT.isOSFreeBSD() || TT.isOSSolaris() || TT.isOSDarwin(); +} + /// Initialize the set of available library functions based on the specified /// target triple. This should be carefully written so that a missing target /// triple gets a sane set of defaults. @@ -142,6 +152,9 @@ TLI.setUnavailable(LibFunc_sincospif_stret); } + if (!hasBcmp(T)) + TLI.setUnavailable(LibFunc_bcmp); + if (T.isMacOSX() && T.getArch() == Triple::x86 && !T.isMacOSXVersionLT(10, 7)) { // x86-32 OSX has a scheme where fwrite and fputs (and some other functions Index: lib/Transforms/Utils/BuildLibCalls.cpp =================================================================== --- lib/Transforms/Utils/BuildLibCalls.cpp +++ lib/Transforms/Utils/BuildLibCalls.cpp @@ -924,27 +924,40 @@ return CI; } -Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, - const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc_memcmp)) +// Common code for memcmp() and bcmp(), which have the exact same properties, +// just a slight difference in semantics. +static Value *emitMemCmpOrBcmp(llvm::LibFunc libfunc, Value *Ptr1, Value *Ptr2, + Value *Len, IRBuilder<> &B, const DataLayout &DL, + const TargetLibraryInfo *TLI) { + if (!TLI->has(libfunc)) return nullptr; Module *M = B.GetInsertBlock()->getModule(); - StringRef MemCmpName = TLI->getName(LibFunc_memcmp); + StringRef CmpFnName = TLI->getName(libfunc); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *MemCmp = M->getOrInsertFunction(MemCmpName, B.getInt32Ty(), - B.getInt8PtrTy(), B.getInt8PtrTy(), - DL.getIntPtrType(Context)); - inferLibFuncAttributes(M, MemCmpName, *TLI); + Value *CmpFn = + M->getOrInsertFunction(CmpFnName, B.getInt32Ty(), B.getInt8PtrTy(), + B.getInt8PtrTy(), DL.getIntPtrType(Context)); + inferLibFuncAttributes(M, CmpFnName, *TLI); CallInst *CI = B.CreateCall( - MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, MemCmpName); + CmpFn, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, CmpFnName); - if (const Function *F = dyn_cast(MemCmp->stripPointerCasts())) + if (const Function *F = dyn_cast(CmpFn->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); return CI; } +Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, + const DataLayout &DL, const TargetLibraryInfo *TLI) { + return emitMemCmpOrBcmp(LibFunc_memcmp, Ptr1, Ptr2, Len, B, DL, TLI); +} + +Value *llvm::emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, + const DataLayout &DL, const TargetLibraryInfo *TLI) { + return emitMemCmpOrBcmp(LibFunc_bcmp, Ptr1, Ptr2, Len, B, DL, TLI); +} + /// Append a suffix to the function name according to the type of 'Op'. static void appendTypeSuffix(Value *Op, StringRef &Name, SmallString<20> &NameBuffer) { Index: lib/Transforms/Utils/SimplifyLibCalls.cpp =================================================================== --- lib/Transforms/Utils/SimplifyLibCalls.cpp +++ lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -825,18 +825,9 @@ return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "memchr"); } -Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { - Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); - - if (LHS == RHS) // memcmp(s,s,x) -> 0 - return Constant::getNullValue(CI->getType()); - - // Make sure we have a constant length. - ConstantInt *LenC = dyn_cast(CI->getArgOperand(2)); - if (!LenC) - return nullptr; - - uint64_t Len = LenC->getZExtValue(); +static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS, + uint64_t Len, IRBuilder<> &B, + const DataLayout &DL) { if (Len == 0) // memcmp(s1,s2,0) -> 0 return Constant::getNullValue(CI->getType()); @@ -904,6 +895,28 @@ Ret = 1; return ConstantInt::get(CI->getType(), Ret); } + return nullptr; +} + +Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { + Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); + Value *Size = CI->getArgOperand(2); + + if (LHS == RHS) // memcmp(s,s,x) -> 0 + return Constant::getNullValue(CI->getType()); + + // Handle constant lengths. + if (ConstantInt *LenC = dyn_cast(Size)) + if (Value *Res = optimizeMemCmpConstantSize(CI, LHS, RHS, + LenC->getZExtValue(), B, DL)) + return Res; + + // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0 + // `bcmp` can be more efficient than memcmp because it only has to know that + // there is a difference, not where is is. + if (isOnlyUsedInZeroEqualityComparison(CI) && TLI->has(LibFunc_bcmp)) { + return emitBCmp(LHS, RHS, Size, B, DL, TLI); + } return nullptr; } @@ -1129,10 +1142,10 @@ IRBuilder<> &B) { if (!isa(Call)) return nullptr; - + IRBuilder<>::FastMathFlagGuard Guard(B); B.setFastMathFlags(Call->getFastMathFlags()); - + // TODO: Can this be shared to also handle LLVM intrinsics? Value *X; switch (Func) { Index: test/CodeGen/X86/memcmp.ll =================================================================== --- test/CodeGen/X86/memcmp.ll +++ test/CodeGen/X86/memcmp.ll @@ -1344,3 +1344,70 @@ %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind ret i32 %m } + +define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: huge_length_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $2147483647 # imm = 0x7FFFFFFF +; X86-NEXT: pushl $-1 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: huge_length_eq: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; This checks non-constant sizes. +define i32 @nonconst_length(i8* %X, i8* %Y, i64 %size) nounwind { +; X86-LABEL: nonconst_length: +; X86: # %bb.0: +; X86-NEXT: jmp memcmp # TAILCALL +; +; X64-LABEL: nonconst_length: +; X64: # %bb.0: +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind + ret i32 %m +} + +define i1 @nonconst_length_eq(i8* %X, i8* %Y, i64 %size) nounwind { +; X86-LABEL: nonconst_length_eq: +; X86: # %bb.0: +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: nonconst_length_eq: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} Index: test/Transforms/InferFunctionAttrs/annotate.ll =================================================================== --- test/Transforms/InferFunctionAttrs/annotate.ll +++ test/Transforms/InferFunctionAttrs/annotate.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -mtriple=x86_64-- -inferattrs -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-- -passes=inferattrs -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-- -inferattrs -S | FileCheck -check-prefix=CHECK-UNKNOWN %s +; RUN: opt < %s -mtriple=x86_64-- -passes=inferattrs -S | FileCheck -check-prefix=CHECK-UNKNOWN %s ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -inferattrs -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-DARWIN %s ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -inferattrs -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-LINUX %s ; RUN: opt < %s -mtriple=nvptx -inferattrs -S | FileCheck -check-prefix=CHECK-NVPTX %s @@ -241,7 +241,10 @@ ; CHECK: declare i64 @atoll(i8* nocapture) [[G1]] declare i64 @atoll(i8*) -; CHECK: declare i32 @bcmp(i8* nocapture, i8* nocapture, i64) [[G1]] +; CHECK-DARWIN: declare i32 @bcmp(i8* nocapture, i8* nocapture, i64) [[G1]] +; CHECK-LINUX: declare i32 @bcmp(i8* nocapture, i8* nocapture, i64) [[G1]] +; CHECK-UNKNOWN-NOT: declare i32 @bcmp(i8* nocapture, i8* nocapture, i64) [[G1]] +; CHECK-NVPTX-NOT: declare i32 @bcmp(i8* nocapture, i8* nocapture, i64) [[G1]] declare i32 @bcmp(i8*, i8*, i64) ; CHECK: declare void @bcopy(i8* nocapture readonly, i8* nocapture, i64) [[G0]] Index: test/Transforms/InstCombine/memcmp-1.ll =================================================================== --- test/Transforms/InstCombine/memcmp-1.ll +++ test/Transforms/InstCombine/memcmp-1.ll @@ -1,6 +1,7 @@ ; Test that the memcmp library call simplifier works correctly. ; -; RUN: opt < %s -instcombine -S | FileCheck %s +; RUN: opt < %s -instcombine -S | FileCheck --check-prefix=CHECK --check-prefix=NOBCMP %s +; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix=CHECK --check-prefix=BCMP %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32:64" @@ -130,3 +131,21 @@ %cmp = icmp eq i32 %call, 0 ret i1 %cmp } + +; Check memcmp(mem1, mem2, size)==0 -> bcmp(mem1, mem2, size)==0 + +define i1 @test_simplify10(i8* %mem1, i8* %mem2, i32 %size) { +; NOBCMP-LABEL: @test_simplify10( +; NOBCMP-NEXT: [[CALL:%.*]] = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 %size) +; NOBCMP-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; NOBCMP-NEXT: ret i1 [[CMP]] +; +; BCMP-LABEL: @test_simplify10( +; BCMP-NEXT: [[CALL:%.*]] = call i32 @bcmp(i8* %mem1, i8* %mem2, i32 %size) +; BCMP-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; BCMP-NEXT: ret i1 [[CMP]] +; + %call = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 %size) + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} Index: tools/llvm-exegesis/lib/Clustering.cpp =================================================================== --- tools/llvm-exegesis/lib/Clustering.cpp +++ tools/llvm-exegesis/lib/Clustering.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "Clustering.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include @@ -91,14 +92,8 @@ } void InstructionBenchmarkClustering::dbScan(const size_t MinPts) { - const size_t NumPoints = Points_.size(); - - // Persistent buffers to avoid allocs. - std::vector Neighbors; - std::vector ToProcess(NumPoints); - std::vector Processed(NumPoints); - - for (size_t P = 0; P < NumPoints; ++P) { + std::vector Neighbors; // Persistent buffer to avoid allocs. + for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { if (!ClusterIdForPoint_[P].isUndef()) continue; // Previously processed in inner loop. rangeQuery(P, Neighbors); @@ -114,40 +109,43 @@ Cluster &CurrentCluster = Clusters_.back(); ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */ CurrentCluster.PointIndices.push_back(P); - Processed[P] = 1; - // Enqueue P's neighbors. - size_t Tail = 0; - auto EnqueueUnprocessed = [&](const std::vector &Neighbors) { - for (size_t Q : Neighbors) - if (!Processed[Q]) { - ToProcess[Tail++] = Q; - Processed[Q] = 1; - } - }; - EnqueueUnprocessed(Neighbors); - - for (size_t Head = 0; Head < Tail; ++Head) { - // Retrieve a point from the queue and add it to the current cluster. - P = ToProcess[Head]; - ClusterId OldCID = ClusterIdForPoint_[P]; - ClusterIdForPoint_[P] = CurrentCluster.Id; - CurrentCluster.PointIndices.push_back(P); - if (OldCID.isNoise()) + // Process P's neighbors. + llvm::SetVector> ToProcess; + ToProcess.insert(Neighbors.begin(), Neighbors.end()); + while (!ToProcess.empty()) { + // Retrieve a point from the set. + const size_t Q = *ToProcess.begin(); + ToProcess.erase(ToProcess.begin()); + + if (ClusterIdForPoint_[Q].isNoise()) { + // Change noise point to border point. + ClusterIdForPoint_[Q] = CurrentCluster.Id; + CurrentCluster.PointIndices.push_back(Q); continue; - assert(OldCID.isUndef()); - - // And extend to the neighbors of P if the region is dense enough. - rangeQuery(P, Neighbors); - if (Neighbors.size() + 1 >= MinPts) - EnqueueUnprocessed(Neighbors); + } + if (!ClusterIdForPoint_[Q].isUndef()) { + continue; // Previously processed. + } + // Add Q to the current custer. + ClusterIdForPoint_[Q] = CurrentCluster.Id; + CurrentCluster.PointIndices.push_back(Q); + // And extend to the neighbors of Q if the region is dense enough. + rangeQuery(Q, Neighbors); + if (Neighbors.size() + 1 >= MinPts) { + ToProcess.insert(Neighbors.begin(), Neighbors.end()); + } } } + // assert(Neighbors.capacity() == (Points_.size() - 1)); + // ^ True, but it is not quaranteed to be true in all the cases. // Add noisy points to noise cluster. - for (size_t P = 0; P < NumPoints; ++P) - if (ClusterIdForPoint_[P].isNoise()) + for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { + if (ClusterIdForPoint_[P].isNoise()) { NoiseCluster_.PointIndices.push_back(P); + } + } } llvm::Expected