Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
@@ -586,6 +586,12 @@
   /// \brief Return true if the hardware has a fast square-root instruction.
   bool haveFastSqrt(Type *Ty) const;
 
+  /// Return true if it is faster to check if a floating-point value is NaN
+  /// (or not-NaN) versus a comparison against a constant FP zero value.
+  /// Targets should override this if materializing a 0.0 for comparison is
+  /// generally as cheap as checking for ordered/unordered.
+  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;
+
   /// \brief Return the expected cost of supporting the floating point operation
   /// of the specified type.
   int getFPOpCost(Type *Ty) const;
@@ -1009,6 +1015,7 @@
                                   bool *Fast) = 0;
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
   virtual bool haveFastSqrt(Type *Ty) = 0;
+  virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
   virtual int getFPOpCost(Type *Ty) = 0;
   virtual int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
                                     Type *Ty) = 0;
@@ -1273,6 +1280,10 @@
   }
   bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
 
+  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) override {
+    return Impl.isFCmpOrdCheaperThanFCmpZero(Ty);
+  }
+
   int getFPOpCost(Type *Ty) override { return Impl.getFPOpCost(Ty); }
 
   int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -316,6 +316,8 @@
 
   bool haveFastSqrt(Type *Ty) { return false; }
 
+  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { return true; }
+
   unsigned getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Basic; }
 
   int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
Index: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
@@ -297,6 +297,10 @@
            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
   }
 
+  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
+    return true;
+  }
+
   unsigned getFPOpCost(Type *Ty) {
     // By default, FP instructions are no more expensive since they are
     // implemented in HW. Target specific TTI can override this.
Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
@@ -281,6 +281,10 @@
   return TTIImpl->haveFastSqrt(Ty);
 }
 
+bool TargetTransformInfo::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const {
+  return TTIImpl->isFCmpOrdCheaperThanFCmpZero(Ty);
+}
+
 int TargetTransformInfo::getFPOpCost(Type *Ty) const {
   int Cost = TTIImpl->getFPOpCost(Ty);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
@@ -125,6 +125,7 @@
   bool isLegalMaskedGather(Type *DataType);
   bool isLegalMaskedScatter(Type *DataType);
   bool hasDivRemOp(Type *DataType, bool IsSigned);
+  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
   const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2537,6 +2537,10 @@
   return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
 }
 
+bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
+  return false;
+}
+
 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
Index: llvm/trunk/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ llvm/trunk/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -26,7 +26,8 @@
 
 
 static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
-                         BasicBlock &CurrBB, Function::iterator &BB) {
+                         BasicBlock &CurrBB, Function::iterator &BB,
+                         const TargetTransformInfo *TTI) {
   // There is no need to change the IR, since backend will emit sqrt
   // instruction if the call has already been marked read-only.
   if (Call->onlyReadsMemory())
@@ -39,7 +40,7 @@
   //
   // (after)
   // v0 = sqrt_noreadmem(src) # native sqrt instruction.
-  // if (v0 is a NaN)
+  // [if (v0 is a NaN) || if (src < 0)]
   //   v1 = sqrt(src)   # library call.
   //   dst = phi(v0, v1)
   //
@@ -48,7 +49,8 @@
   // Create phi and replace all uses.
   BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
   IRBuilder<> Builder(JoinBB, JoinBB->begin());
-  PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
+  Type *Ty = Call->getType();
+  PHINode *Phi = Builder.CreatePHI(Ty, 2);
   Call->replaceAllUsesWith(Phi);
 
   // Create basic block LibCallBB and insert a call to library function sqrt.
@@ -65,7 +67,10 @@
   Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
   CurrBB.getTerminator()->eraseFromParent();
   Builder.SetInsertPoint(&CurrBB);
-  Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
+  Value *FCmp = TTI->isFCmpOrdCheaperThanFCmpZero(Ty)
+                    ? Builder.CreateFCmpORD(Call, Call)
+                    : Builder.CreateFCmpOGE(Call->getOperand(0),
+                                            ConstantFP::get(Ty, 0.0));
   Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
 
   // Add phi operands.
@@ -106,7 +111,7 @@
     case LibFunc_sqrtf:
     case LibFunc_sqrt:
       if (TTI->haveFastSqrt(Call->getType()) &&
-          optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+          optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI))
         break;
       continue;
     default:
Index: llvm/trunk/test/CodeGen/X86/sqrt-partial.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sqrt-partial.ll
+++ llvm/trunk/test/CodeGen/X86/sqrt-partial.ll
@@ -3,7 +3,7 @@
 
 ; PR31455 - https://bugs.llvm.org/show_bug.cgi?id=31455
 ; We have to assume that errno can be set, so we have to make a libcall in that case.
-; But it's better for perf to check that the argument is valid rather than the result of 
+; But it's better for perf to check that the argument is valid rather than the result of
 ; sqrtss/sqrtsd.
 ; Note: This is really a test of the -partially-inline-libcalls IR pass (and we have an IR test
 ; for that), but we're checking the final asm to make sure that comes out as expected too.
@@ -11,11 +11,11 @@
 define float @f(float %val) nounwind {
 ; CHECK-LABEL: f:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    sqrtss %xmm0, %xmm1
-; CHECK-NEXT:    ucomiss %xmm1, %xmm1
-; CHECK-NEXT:    jp .LBB0_2
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomiss %xmm1, %xmm0
+; CHECK-NEXT:    jb .LBB0_2
 ; CHECK-NEXT:  # BB#1: # %.split
-; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    sqrtss %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB0_2: # %call.sqrt
 ; CHECK-NEXT:    jmp sqrtf # TAILCALL
@@ -26,11 +26,11 @@
 define double @d(double %val) nounwind {
 ; CHECK-LABEL: d:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    sqrtsd %xmm0, %xmm1
-; CHECK-NEXT:    ucomisd %xmm1, %xmm1
-; CHECK-NEXT:    jp .LBB1_2
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomisd %xmm1, %xmm0
+; CHECK-NEXT:    jb .LBB1_2
 ; CHECK-NEXT:  # BB#1: # %.split
-; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    sqrtsd %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB1_2: # %call.sqrt
 ; CHECK-NEXT:    jmp sqrt # TAILCALL
Index: llvm/trunk/test/Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll
===================================================================
--- llvm/trunk/test/Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll
+++ llvm/trunk/test/Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll
@@ -1,18 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -partially-inline-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
 ; RUN: opt -S -passes=partially-inline-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
 
 define float @f(float %val) {
-; CHECK: @f
-; CHECK: entry:
-; CHECK-NEXT: %[[RES:.+]] = tail call float @sqrtf(float %val) #0
-; CHECK-NEXT: %[[CMP:.+]] = fcmp oeq float %[[RES]], %[[RES]]
-; CHECK-NEXT: br i1 %[[CMP]], label %[[EXIT:.+]], label %[[CALL:.+]]
-; CHECK: [[CALL]]:
-; CHECK-NEXT: %[[RES2:.+]] = tail call float @sqrtf(float %val){{$}}
-; CHECK-NEXT: br label %[[EXIT]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: %[[RET:.+]] = phi float [ %[[RES]], %entry ], [ %[[RES2]], %[[CALL]] ]
-; CHECK-NEXT: ret float %[[RET]]
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = tail call float @sqrtf(float [[VAL:%.*]]) #0
+; CHECK-NEXT:    [[TMP0:%.*]] = fcmp oge float [[VAL]], 0.000000e+00
+; CHECK-NEXT:    br i1 [[TMP0]], label [[ENTRY_SPLIT:%.*]], label [[CALL_SQRT:%.*]]
+; CHECK:       call.sqrt:
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call float @sqrtf(float [[VAL]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi float [ [[RES]], [[ENTRY:%.*]] ], [ [[TMP1]], [[CALL_SQRT]] ]
+; CHECK-NEXT:    ret float [[TMP2]]
+;
 entry:
   %res = tail call float @sqrtf(float %val)
   ret float %res
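For illustration only, not part of the patch above: an out-of-tree target that wants the same behavior as X86 opts in by overriding the new hook in its own TTI subclass, mirroring the X86TTIImpl override. The class name MyTargetTTIImpl below is hypothetical; the hook and its semantics are the ones added in this patch.

// Sketch only: "MyTargetTTIImpl" is a made-up target TTI class.
// Returning false tells PartiallyInlineLibCalls to emit the zero check on the
// sqrt argument:
//   %cmp = fcmp oge <ty> %src, 0.0
// instead of the is-not-NaN check on the sqrt result:
//   %cmp = fcmp ord <ty> %res, %res
// which is what the BasicTTIImpl default of 'return true' selects.
bool MyTargetTTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
  return false;
}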