Index: llvm/include/llvm/Analysis/ConstantFolding.h =================================================================== --- llvm/include/llvm/Analysis/ConstantFolding.h +++ llvm/include/llvm/Analysis/ConstantFolding.h @@ -31,6 +31,7 @@ class GlobalValue; class Instruction; class TargetLibraryInfo; +class TargetTransformInfo; class Type; /// If this constant is a constant offset from a global, return the global and @@ -47,7 +48,8 @@ /// this function can only fail when attempting to fold instructions like loads /// and stores, which have no constant expression form. Constant *ConstantFoldInstruction(Instruction *I, const DataLayout &DL, - const TargetLibraryInfo *TLI = nullptr); + const TargetLibraryInfo *TLI = nullptr, + const TargetTransformInfo *TTI = nullptr); /// ConstantFoldConstant - Fold the constant using the specified DataLayout. /// This function always returns a non-null constant: Either the folding result, @@ -63,7 +65,8 @@ /// Constant *ConstantFoldInstOperands(Instruction *I, ArrayRef<Constant *> Ops, const DataLayout &DL, - const TargetLibraryInfo *TLI = nullptr); + const TargetLibraryInfo *TLI = nullptr, + const TargetTransformInfo *TTI = nullptr); /// ConstantFoldCompareInstOperands - Attempt to constant fold a compare /// instruction (icmp/fcmp) with the specified operands. If it fails, it Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -824,6 +824,10 @@ /// Return true if the hardware has a fast square-root instruction. 
bool haveFastSqrt(Type *Ty) const; + /// Return true if folding a floating-point instruction to a constant + /// should produce zero instead of a denormal + bool enableFPDenormalFlushToZero(const Instruction &Inst) const; + /// Return true if it is faster to check if a floating-point value is NaN /// (or not-NaN) versus a comparison against a constant FP zero value. /// Targets should override this if materializing a 0.0 for comparison is @@ -1597,6 +1601,7 @@ bool *Fast) = 0; virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0; virtual bool haveFastSqrt(Type *Ty) = 0; + virtual bool enableFPDenormalFlushToZero(const Instruction &Inst) = 0; virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0; virtual InstructionCost getFPOpCost(Type *Ty) = 0; virtual InstructionCost getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, @@ -2059,6 +2064,10 @@ } bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); } + bool enableFPDenormalFlushToZero(const Instruction &Inst) override { + return Impl.enableFPDenormalFlushToZero(Inst); + } + bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) override { return Impl.isFCmpOrdCheaperThanFCmpZero(Ty); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -361,6 +361,10 @@ bool haveFastSqrt(Type *Ty) const { return false; } + bool enableFPDenormalFlushToZero(const Instruction &Inst) const { + return false; + } + bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return true; } InstructionCost getFPOpCost(Type *Ty) const { Index: llvm/include/llvm/Transforms/InstCombine/InstCombiner.h =================================================================== --- llvm/include/llvm/Transforms/InstCombine/InstCombiner.h +++ llvm/include/llvm/Transforms/InstCombine/InstCombiner.h @@ -43,10 +43,6 @@ /// This class provides both the 
logic to recursively visit instructions and /// combine them. class LLVM_LIBRARY_VISIBILITY InstCombiner { - /// Only used to call target specific intrinsic combining. - /// It must **NOT** be used for any other purpose, as InstCombine is a - /// target-independent canonicalization transform. - TargetTransformInfo &TTI; public: /// Maximum size of array considered when transforming. @@ -69,6 +65,10 @@ // Required analyses. AssumptionCache &AC; TargetLibraryInfo &TLI; + /// Only used to call target specific intrinsic combining. + /// It must **NOT** be used for any other purpose, as InstCombine is a + /// target-independent canonicalization transform. + TargetTransformInfo &TTI; DominatorTree &DT; const DataLayout &DL; const SimplifyQuery SQ; @@ -89,8 +89,8 @@ DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI) - : TTI(TTI), Builder(Builder), Worklist(Worklist), - MinimizeSize(MinimizeSize), AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), + : Builder(Builder), Worklist(Worklist), + MinimizeSize(MinimizeSize), AA(AA), AC(AC), TLI(TLI), TTI(TTI), DT(DT), DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {} virtual ~InstCombiner() {} Index: llvm/lib/Analysis/ConstantFolding.cpp =================================================================== --- llvm/lib/Analysis/ConstantFolding.cpp +++ llvm/lib/Analysis/ConstantFolding.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/Config/config.h" @@ -1006,14 +1007,47 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode, ArrayRef<Constant *> Ops, const DataLayout &DL, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI = nullptr) { Type 
*DestTy = InstOrCE->getType(); - if (Instruction::isUnaryOp(Opcode)) - return ConstantFoldUnaryOpOperand(Opcode, Ops[0], DL); + if (Instruction::isUnaryOp(Opcode)) { + Constant *C = ConstantFoldUnaryOpOperand(Opcode, Ops[0], DL); + if (auto *CFP = dyn_cast_or_null<ConstantFP>(C)) { + // If folding produces a floating point denormal, check whether + // it should be forced to zero. + if (CFP->getValueAPF().isDenormal()) { + if (auto *I = dyn_cast<Instruction>(InstOrCE)) { + // If TTI is not available to determine support for flushing + // denormals to zero, do not fold the instruction. + if (!TTI) + return nullptr; + if (TTI->enableFPDenormalFlushToZero(*I)) + return Constant::getNullValue(C->getType()); + } + } + } + return C; + } - if (Instruction::isBinaryOp(Opcode)) - return ConstantFoldBinaryOpOperands(Opcode, Ops[0], Ops[1], DL); + if (Instruction::isBinaryOp(Opcode)) { + Constant *C = ConstantFoldBinaryOpOperands(Opcode, Ops[0], Ops[1], DL); + if (auto *CFP = dyn_cast_or_null<ConstantFP>(C)) { + // If folding produces a floating point denormal, check whether + // it should be forced to zero. + if (CFP->getValueAPF().isDenormal()) { + if (auto *I = dyn_cast<Instruction>(InstOrCE)) { + // If TTI is not available to determine support for flushing + // denormals to zero, do not fold the instruction. + if (!TTI) + return nullptr; + if (TTI->enableFPDenormalFlushToZero(*I)) + return Constant::getNullValue(C->getType()); + } + } + } + return C; + } if (Instruction::isCast(Opcode)) return ConstantFoldCastOperand(Opcode, Ops[0], DestTy, DL); @@ -1106,7 +1140,8 @@ } // end anonymous namespace Constant *llvm::ConstantFoldInstruction(Instruction *I, const DataLayout &DL, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI) { // Handle PHI nodes quickly here... 
if (auto *PN = dyn_cast<PHINode>(I)) { Constant *CommonValue = nullptr; @@ -1166,7 +1201,7 @@ if (auto *EVI = dyn_cast<ExtractValueInst>(I)) return ConstantExpr::getExtractValue(Ops[0], EVI->getIndices()); - return ConstantFoldInstOperands(I, Ops, DL, TLI); + return ConstantFoldInstOperands(I, Ops, DL, TLI, TTI); } Constant *llvm::ConstantFoldConstant(const Constant *C, const DataLayout &DL, @@ -1178,8 +1213,9 @@ Constant *llvm::ConstantFoldInstOperands(Instruction *I, ArrayRef<Constant *> Ops, const DataLayout &DL, - const TargetLibraryInfo *TLI) { - return ConstantFoldInstOperandsImpl(I, I->getOpcode(), Ops, DL, TLI); + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI) { + return ConstantFoldInstOperandsImpl(I, I->getOpcode(), Ops, DL, TLI, TTI); } Constant *llvm::ConstantFoldCompareInstOperands(unsigned IntPredicate, Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -547,6 +547,11 @@ return TTIImpl->haveFastSqrt(Ty); } +bool TargetTransformInfo::enableFPDenormalFlushToZero( + const Instruction &Inst) const { + return TTIImpl->enableFPDenormalFlushToZero(Inst); +} + bool TargetTransformInfo::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return TTIImpl->isFCmpOrdCheaperThanFCmpZero(Ty); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -67,6 +67,8 @@ bool areInlineCompatible(const Function *Caller, const Function *Callee) const; + bool enableFPDenormalFlushToZero(const Instruction &Inst) const; + /// \name Scalar TTI Implementations /// @{ Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ 
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -50,6 +50,25 @@ return (CallerBits & CalleeBits) == CalleeBits; } +bool AArch64TTIImpl::enableFPDenormalFlushToZero( + const Instruction &Inst) const { + if (!Inst.isFast()) + return false; + + switch (Inst.getOpcode()) { + case Instruction::FNeg: + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + return true; + default: + break; + } + return false; +} + /// Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -106,6 +106,8 @@ bool areInlineCompatible(const Function *Caller, const Function *Callee) const; + bool enableFPDenormalFlushToZero(const Instruction &Inst) const; + bool enableInterleavedAccessVectorization() { return true; } TTI::AddressingModeKind Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -100,6 +100,24 @@ return MatchExact && MatchSubset; } +bool ARMTTIImpl::enableFPDenormalFlushToZero(const Instruction &Inst) const { + if (!Inst.isFast()) + return false; + + switch (Inst.getOpcode()) { + case Instruction::FNeg: + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + return true; + default: + break; + } + return false; +} + TTI::AddressingModeKind ARMTTIImpl::getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const { Index: llvm/lib/Transforms/InstCombine/InstructionCombining.cpp 
=================================================================== --- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -3992,7 +3992,7 @@ // Instruction isn't dead, see if we can constant propagate it. if (!I->use_empty() && (I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) { - if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) { + if (Constant *C = ConstantFoldInstruction(I, DL, &TLI, &TTI)) { LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n'); @@ -4199,6 +4199,7 @@ /// whose condition is a known constant, we only visit the reachable successors. static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, InstructionWorklist &ICWorklist) { bool MadeIRChange = false; SmallPtrSet<BasicBlock *, 32> Visited; @@ -4220,7 +4221,7 @@ // ConstantProp instruction if trivially constant. if (!Inst.use_empty() && (Inst.getNumOperands() == 0 || isa<Constant>(Inst.getOperand(0)))) - if (Constant *C = ConstantFoldInstruction(&Inst, DL, TLI)) { + if (Constant *C = ConstantFoldInstruction(&Inst, DL, TLI, TTI)) { LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << Inst << '\n'); Inst.replaceAllUsesWith(C); @@ -4366,7 +4367,7 @@ LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); - MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); + MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, &TTI, Worklist); InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT, ORE, BFI, PSI, DL, LI); Index: llvm/test/Transforms/InstCombine/AArch64/constant-fold-fp-denormal.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/InstCombine/AArch64/constant-fold-fp-denormal.ll @@ -0,0 +1,31 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s + +target triple = "aarch64--linux-gnu" + +define 
float @test_float() { +; CHECK-LABEL: @test_float( +; CHECK-NEXT: ret float 0x3800000000000000 + %mul = fmul float 0x3810000000000000, 5.000000e-01 + ret float %mul +} + +define double @test_double() { +; CHECK-LABEL: @test_double( +; CHECK-NEXT: ret double 0x8000000000000 + %mul = fmul double 0x10000000000000, 5.000000e-01 + ret double %mul +} + +define float @test_float_fast() { +; CHECK-LABEL: @test_float_fast( +; CHECK-NEXT: ret float 0.000000e+00 + %mul = fmul fast float 0x3810000000000000, 5.000000e-01 + ret float %mul +} + +define double @test_double_fast() { +; CHECK-LABEL: @test_double_fast( +; CHECK-NEXT: ret double 0.000000e+00 + %mul = fmul fast double 0x10000000000000, 5.000000e-01 + ret double %mul +} Index: llvm/test/Transforms/InstCombine/ARM/constant-fold-fp-denormal.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/InstCombine/ARM/constant-fold-fp-denormal.ll @@ -0,0 +1,31 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s + +target triple = "armv8-arm-none-eabi" + +define float @test_float() { +; CHECK-LABEL: @test_float( +; CHECK-NEXT: ret float 0x3800000000000000 + %mul = fmul float 0x3810000000000000, 5.000000e-01 + ret float %mul +} + +define double @test_double() { +; CHECK-LABEL: @test_double( +; CHECK-NEXT: ret double 0x8000000000000 + %mul = fmul double 0x10000000000000, 5.000000e-01 + ret double %mul +} + +define float @test_float_fast() { +; CHECK-LABEL: @test_float_fast( +; CHECK-NEXT: ret float 0.000000e+00 + %mul = fmul fast float 0x3810000000000000, 5.000000e-01 + ret float %mul +} + +define double @test_double_fast() { +; CHECK-LABEL: @test_double_fast( +; CHECK-NEXT: ret double 0.000000e+00 + %mul = fmul fast double 0x10000000000000, 5.000000e-01 + ret double %mul +}