Index: lib/Transforms/InstCombine/InstCombineCasts.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCasts.cpp +++ lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -16,6 +16,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" using namespace llvm; using namespace PatternMatch; @@ -637,8 +638,9 @@ return nullptr; } -/// CanEvaluateZExtd - Determine if the specified value can be computed in the -/// specified wider type and produce the same low bits. If not, return false. +/// ComputeEvaluateZExtdCost - Compute the extra cost if the specified value is +/// computed in the specified wider type and produce the same low bits. +/// If such evalutation is not possible, return INT_MAX. /// /// If this function returns true, it can also return a non-zero number of bits /// (in BitsToClear) which indicates that the value it computes is correct for @@ -649,48 +651,55 @@ /// %C = lshr i32 %B, 8 /// %E = zext i32 %C to i64 /// -/// CanEvaluateZExtd for the 'lshr' will return true, and BitsToClear will be -/// set to 8 to indicate that the promoted value needs to have bits 24-31 -/// cleared in addition to bits 32-63. Since an 'and' will be generated to -/// clear the top bits anyway, doing this has no extra cost. +/// ComputeEvaluateZExtdCost for the 'lshr' will set BitsToClear 8 to indicate +/// that the promoted value needs to have bits 24-31 cleared in addition to bits +/// 32-63. Since an 'and' will be generated to clear the top bits anyway, doing +/// this has no extra cost. /// /// This function works on both vectors and scalars. -static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, - InstCombiner &IC, Instruction *CxtI) { +static int ComputeEvaluateZExtdCost(Value *V, Type *Ty, unsigned &BitsToClear, + InstCombiner &IC, Instruction *CxtI, + const TargetTransformInfo &TTI) { BitsToClear = 0; if (isa(V)) - return true; + return 0; + + Type *OrigTy = V->getType(); Instruction *I = dyn_cast(V); - if (!I) return false; + if (!I) return INT_MAX; // If the input is a truncate from the destination type, we can trivially // eliminate it. if (isa(I) && I->getOperand(0)->getType() == Ty) - return true; + return 0; // We can't extend or shrink something that has multiple uses: doing so would // require duplicating the instruction in general, which isn't profitable. - if (!I->hasOneUse()) return false; + if (!I->hasOneUse()) return INT_MAX; unsigned Opc = I->getOpcode(), Tmp; switch (Opc) { case Instruction::ZExt: // zext(zext(x)) -> zext(x). case Instruction::SExt: // zext(sext(x)) -> sext(x). case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x) - return true; + return 0; case Instruction::And: case Instruction::Or: case Instruction::Xor: case Instruction::Add: case Instruction::Sub: - case Instruction::Mul: - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || - !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI)) - return false; + case Instruction::Mul: { + int LHSCost = ComputeEvaluateZExtdCost(I->getOperand(0), Ty, BitsToClear, + IC, CxtI, TTI); + if (LHSCost == INT_MAX) return INT_MAX; + int RHSCost = + ComputeEvaluateZExtdCost(I->getOperand(1), Ty, Tmp, IC, CxtI, TTI); + if (RHSCost == INT_MAX) return INT_MAX; // These can all be promoted if neither operand has 'bits to clear'. if (BitsToClear == 0 && Tmp == 0) - return true; + return LHSCost + RHSCost + TTI.getArithmeticInstrCost(Opc, Ty) - + TTI.getArithmeticInstrCost(Opc, OrigTy); // If the operation is an AND/OR/XOR and the bits to clear are zero in the // other side, BitsToClear is ok. @@ -703,63 +712,84 @@ if (IC.MaskedValueIsZero(I->getOperand(1), APInt::getHighBitsSet(VSize, BitsToClear), 0, CxtI)) - return true; + return LHSCost + RHSCost + TTI.getArithmeticInstrCost(Opc, Ty) - + TTI.getArithmeticInstrCost(Opc, OrigTy); } // Otherwise, we don't know how to analyze this BitsToClear case yet. - return false; + return INT_MAX; + } case Instruction::Shl: // We can promote shl(x, cst) if we can promote x. Since shl overwrites the // upper bits we can reduce BitsToClear by the shift amount. if (ConstantInt *Amt = dyn_cast(I->getOperand(1))) { - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) - return false; + int Cost = ComputeEvaluateZExtdCost(I->getOperand(0), Ty, BitsToClear, IC, + CxtI, TTI); + if (Cost == INT_MAX) + return INT_MAX; uint64_t ShiftAmt = Amt->getZExtValue(); BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; - return true; + return Cost + TTI.getArithmeticInstrCost(Opc, Ty) - + TTI.getArithmeticInstrCost(Opc, OrigTy); } - return false; + return INT_MAX; case Instruction::LShr: // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. if (ConstantInt *Amt = dyn_cast(I->getOperand(1))) { - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) - return false; + int Cost = ComputeEvaluateZExtdCost(I->getOperand(0), Ty, BitsToClear, IC, + CxtI, TTI); + if (Cost == INT_MAX) + return INT_MAX; BitsToClear += Amt->getZExtValue(); if (BitsToClear > V->getType()->getScalarSizeInBits()) BitsToClear = V->getType()->getScalarSizeInBits(); - return true; + return Cost + TTI.getArithmeticInstrCost(Opc, Ty) - + TTI.getArithmeticInstrCost(Opc, OrigTy); } // Cannot promote variable LSHR. - return false; - case Instruction::Select: - if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || - !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) || - // TODO: If important, we could handle the case when the BitsToClear are - // known zero in the disagreeing side. - Tmp != BitsToClear) - return false; - return true; + return INT_MAX; + case Instruction::Select: { + int Cost1 = + ComputeEvaluateZExtdCost(I->getOperand(1), Ty, Tmp, IC, CxtI, TTI); + if (Cost1 == INT_MAX) + return INT_MAX; + int Cost2 = ComputeEvaluateZExtdCost(I->getOperand(2), Ty, BitsToClear, IC, + CxtI, TTI); + // TODO: If important, we could handle the case when the BitsToClear are + // known zero in the disagreeing side. + if (Cost2 == INT_MAX || Tmp != BitsToClear) { + return INT_MAX; + } + return Cost1 + Cost2; + } case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never // get into trouble with cyclic PHIs here because we only consider // instructions with a single use. PHINode *PN = cast(I); - if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI)) - return false; - for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) - if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) || + int Cost = ComputeEvaluateZExtdCost(PN->getIncomingValue(0), Ty, + BitsToClear, IC, CxtI, TTI); + if (Cost == INT_MAX) + return INT_MAX; + int AccumCost = Cost; + for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) { + Cost = ComputeEvaluateZExtdCost(PN->getIncomingValue(i), Ty, Tmp, IC, + CxtI, TTI); + if (Cost == INT_MAX || // TODO: If important, we could handle the case when the BitsToClear // are known zero in the disagreeing input. Tmp != BitsToClear) - return false; - return true; + return INT_MAX; + AccumCost += Cost; + } + return AccumCost; } default: // TODO: Can handle more cases here. - return false; + return INT_MAX; } } @@ -787,13 +817,13 @@ // strange. unsigned BitsToClear; if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { + ComputeEvaluateZExtdCost(Src, DestTy, BitsToClear, *this, &CI, TTI) <= 0) { assert(BitsToClear < SrcTy->getScalarSizeInBits() && "Unreasonable BitsToClear"); // Okay, we can transform this! Insert the new expression now. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" - " to avoid zero extend: " << CI); + " to avoid zero extend: " << CI << '\n'); Value *Res = EvaluateInDifferentType(Src, DestTy, false); assert(Res->getType() == DestTy); @@ -985,69 +1015,87 @@ return nullptr; } -/// CanEvaluateSExtd - Return true if we can take the specified value -/// and return it as type Ty without inserting any new casts and without -/// changing the value of the common low bits. This is used by code that tries -/// to promote integer operations to a wider types will allow us to eliminate -/// the extension. +/// ComputeEvaluateSExtdCost - Return the extra cost if we take the specified +/// value and return it as type Ty without inserting any new casts and without +/// changing the value of the common low bits. If such evalution is not +/// possible, return INT_MAX. +/// +/// This is used by code that tries to promote integer operations to a wider +/// types will allow us to eliminate the extension. /// /// This function works on both vectors and scalars. /// -static bool CanEvaluateSExtd(Value *V, Type *Ty) { +static int ComputeEvaluateSExtdCost(Value *V, Type *Ty, + const TargetTransformInfo &TTI) { assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() && "Can't sign extend type to a smaller type"); // If this is a constant, it can be trivially promoted. if (isa(V)) - return true; + return 0; Instruction *I = dyn_cast(V); - if (!I) return false; + if (!I) return INT_MAX; // If this is a truncate from the dest type, we can trivially eliminate it. if (isa(I) && I->getOperand(0)->getType() == Ty) - return true; + return 0; // We can't extend or shrink something that has multiple uses: doing so would // require duplicating the instruction in general, which isn't profitable. - if (!I->hasOneUse()) return false; + if (!I->hasOneUse()) return INT_MAX; - switch (I->getOpcode()) { + unsigned Opc = I->getOpcode(); + switch (Opc) { case Instruction::SExt: // sext(sext(x)) -> sext(x) case Instruction::ZExt: // sext(zext(x)) -> zext(x) case Instruction::Trunc: // sext(trunc(x)) -> trunc(x) or sext(x) - return true; + return 0; case Instruction::And: case Instruction::Or: case Instruction::Xor: case Instruction::Add: case Instruction::Sub: - case Instruction::Mul: + case Instruction::Mul: { // These operators can all arbitrarily be extended if their inputs can. - return CanEvaluateSExtd(I->getOperand(0), Ty) && - CanEvaluateSExtd(I->getOperand(1), Ty); + int LHSCost = ComputeEvaluateSExtdCost(I->getOperand(0), Ty, TTI); + if (LHSCost == INT_MAX) return INT_MAX; + int RHSCost = ComputeEvaluateSExtdCost(I->getOperand(1), Ty, TTI); + if (RHSCost == INT_MAX) return INT_MAX; + return LHSCost + RHSCost + TTI.getArithmeticInstrCost(Opc, Ty) - + TTI.getArithmeticInstrCost(Opc, I->getType()); + } //case Instruction::Shl: TODO //case Instruction::LShr: TODO - case Instruction::Select: - return CanEvaluateSExtd(I->getOperand(1), Ty) && - CanEvaluateSExtd(I->getOperand(2), Ty); + case Instruction::Select: { + auto TrueCost = ComputeEvaluateSExtdCost(I->getOperand(1), Ty, TTI); + if (TrueCost == INT_MAX) return INT_MAX; + auto FalseCost = ComputeEvaluateSExtdCost(I->getOperand(2), Ty, TTI); + if (FalseCost == INT_MAX) return INT_MAX; + return TrueCost + FalseCost; + } case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never // get into trouble with cyclic PHIs here because we only consider // instructions with a single use. PHINode *PN = cast(I); - for (Value *IncValue : PN->incoming_values()) - if (!CanEvaluateSExtd(IncValue, Ty)) return false; - return true; + int AccumCost = 0; + for (Value *IncValue : PN->incoming_values()) { + int Cost = ComputeEvaluateSExtdCost(IncValue, Ty, TTI); + if (Cost == INT_MAX) + return INT_MAX; + AccumCost += Cost; + } + return AccumCost; } default: // TODO: Can handle more cases here. break; } - return false; + return INT_MAX; } Instruction *InstCombiner::visitSExt(SExtInst &CI) { @@ -1081,10 +1129,10 @@ // expression tree to something weird like i93 unless the source is also // strange. if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateSExtd(Src, DestTy)) { + ComputeEvaluateSExtdCost(Src, DestTy, TTI) <= 0) { // Okay, we can transform this! Insert the new expression now. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" - " to avoid sign extend: " << CI); + " to avoid sign extend: " << CI << '\n'); Value *Res = EvaluateInDifferentType(Src, DestTy, true); assert(Res->getType() == DestTy); Index: lib/Transforms/InstCombine/InstCombineInternal.h =================================================================== --- lib/Transforms/InstCombine/InstCombineInternal.h +++ lib/Transforms/InstCombine/InstCombineInternal.h @@ -35,6 +35,7 @@ class DataLayout; class DominatorTree; class TargetLibraryInfo; +class TargetTransformInfo; class DbgDeclareInst; class MemIntrinsic; class MemSetInst; @@ -183,6 +184,7 @@ TargetLibraryInfo *TLI; DominatorTree *DT; const DataLayout &DL; + const TargetTransformInfo &TTI; // Optional analyses. When non-null, these can both be used to do better // combining and will be updated to reflect any changes. @@ -193,9 +195,11 @@ public: InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder, bool MinimizeSize, AssumptionCache *AC, TargetLibraryInfo *TLI, - DominatorTree *DT, const DataLayout &DL, LoopInfo *LI) + DominatorTree *DT, const DataLayout &DL, + const TargetTransformInfo &TTI, LoopInfo *LI) : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), - AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {} + AC(AC), TLI(TLI), DT(DT), DL(DL), TTI(TTI), LI(LI), + MadeIRChange(false) {} /// \brief Run the combiner over the entire worklist until it is empty. /// Index: lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- lib/Transforms/InstCombine/InstructionCombining.cpp +++ lib/Transforms/InstCombine/InstructionCombining.cpp @@ -48,6 +48,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" @@ -2973,7 +2974,9 @@ static bool combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, AssumptionCache &AC, TargetLibraryInfo &TLI, - DominatorTree &DT, LoopInfo *LI = nullptr) { + DominatorTree &DT, + const TargetTransformInfo &TTI, + LoopInfo *LI = nullptr) { // Minimizing size? bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize); auto &DL = F.getParent()->getDataLayout(); @@ -2998,7 +3001,8 @@ if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist)) Changed = true; - InstCombiner IC(Worklist, &Builder, MinimizeSize, &AC, &TLI, &DT, DL, LI); + InstCombiner IC(Worklist, &Builder, MinimizeSize, &AC, &TLI, &DT, DL, TTI, + LI); if (IC.run()) Changed = true; @@ -3014,10 +3018,10 @@ auto &AC = AM->getResult(F); auto &DT = AM->getResult(F); auto &TLI = AM->getResult(F); - + const auto &TTI = AM->getResult(F); auto *LI = AM->getCachedResult(F); - if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI)) + if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, TTI, LI)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -3052,6 +3056,7 @@ AU.setPreservesCFG(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addPreserved(); } @@ -3063,13 +3068,14 @@ // Required analyses. auto &AC = getAnalysis().getAssumptionCache(F); auto &TLI = getAnalysis().getTLI(); + const auto &TTI = getAnalysis().getTTI(F); auto &DT = getAnalysis().getDomTree(); // Optional analyses. auto *LIWP = getAnalysisIfAvailable(); auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI); + return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, TTI, LI); } char InstructionCombiningPass::ID = 0; @@ -3077,6 +3083,7 @@ "Combine redundant instructions", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", "Combine redundant instructions", false, false) Index: test/Transforms/InstCombine/NVPTX/lit.local.cfg =================================================================== --- /dev/null +++ test/Transforms/InstCombine/NVPTX/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'NVPTX' in config.root.targets: + config.unsupported = True Index: test/Transforms/InstCombine/NVPTX/no-widen-expensive.ll =================================================================== --- /dev/null +++ test/Transforms/InstCombine/NVPTX/no-widen-expensive.ll @@ -0,0 +1,22 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-unknown-unknown" + +; For the nvptx64 architecture, the cost of an arithmetic instruction on a +; 64-bit integer is twice as expensive as that on a 32-bit integer, because the +; hardware needs to simulate a 64-bit integer using two 32-bit integers. +; Therefore, in this particular architecture, we should not widen induction +; variables to 64-bit integers even though i64 is a legal type in the 64-bit +; PTX ISA. + +define i64 @test(i64 %A) { +; CHECK-LABEL: @test + %trunc = trunc i64 %A to i32 +; CHECK-NOT: xor i64 + %xor = xor i32 %trunc, 23 +; CHECK-NOT: and i64 + %and = and i32 %xor, 112 +; CHECK: zext i32 + %zext = zext i32 %and to i64 + ret i64 %zext +}